November 8th, 2024

Goal

Enhance the speed of the "Find Targets" and "View Targets" modules and fix various bugs.

Hypothesis

If the "Find Targets" and "View Targets" modules are optimized for speed, users will experience significantly faster searches for genes, guides, and DNA sequences.

If the bugs are resolved, users will be able to operate the program without experiencing crashes.

Expected Results

Results

Bugs Fixed

Special thanks to David for stress-testing the program last week. The following issues have been resolved:

Next Steps

My next step is to address any bugs that David may encounter during testing. I’ll first focus on stabilizing the modules that still have outstanding issues, and then proceed with the implementation of the Microbiome Analysis module.

Files changed (41)
  1. src/controllers/FindTargetsController.py +38 -29
  2. src/controllers/HomeWindowController.py +1 -1
  3. src/controllers/MainWindowController.py +149 -147
  4. src/controllers/PopulationAnalysisWindowController.py +1 -1
  5. src/controllers/ScoringOptionsController.py +79 -0
  6. src/controllers/ViewTargetsController.py +302 -56
  7. src/models/AnnotationParser.py +196 -106
  8. src/models/CSPRparser.py +160 -67
  9. src/models/FindTargetsModel.py +85 -75
  10. src/models/GlobalSettings.py +5 -0
  11. src/models/ScoringOptionsModel.py +97 -0
  12. src/models/ViewTargetsModel.py +174 -141
  13. src/ui/new_endonuclease_window.ui +29 -35
  14. src/ui/{scoring_window.ui → scoring_options.ui} +9 -80
  15. src/ui/view_targets.ui +45 -45
  16. src/utils/LoggingMixin.py +40 -0
  17. src/utils/azimuth/__init__.py +0 -0
  18. src/utils/azimuth/features/__init__.py +0 -0
  19. src/utils/azimuth/features/featurization.py +546 -0
  20. src/utils/azimuth/load_data.py +486 -0
  21. src/utils/azimuth/model_comparison.py +716 -0
  22. src/utils/azimuth/predict.py +365 -0
  23. src/utils/azimuth/saved_models/V3_model_full.pickle +0 -0
  24. src/utils/azimuth/saved_models/V3_model_nopos.pickle +0 -0
  25. src/utils/azimuth/util.py +1331 -0
  26. src/utils/ui.py +0 -92
  27. src/views/AnnotationParser.py +0 -429
  28. src/views/CMainWindow.py +0 -987
  29. src/views/CloseableTabWidget.py +156 -0
  30. src/views/FindTargetsView.py +115 -62
  31. src/views/HomeWindowView.py +25 -4
  32. src/views/MainWindowView.py +76 -307
  33. src/views/MultitargetingWindowView.py +1 -2
  34. src/views/NewEndonuclease.py +0 -228
  35. src/views/NewGenome.py +0 -705
  36. src/views/PopulationAnalysisWindowView.py +1 -1
  37. src/views/ScoringOptionsView.py +162 -0
  38. src/views/ViewTargetsView.py +278 -55
  39. src/views/closingWin.py +0 -73
  40. src/views/export_tool.py +0 -259
  41. src/views/generateLib.py +0 -662
src/controllers/FindTargetsController.py CHANGED
@@ -2,12 +2,13 @@ from models.FindTargetsModel import FindTargetsModel
2
  from views.FindTargetsView import FindTargetsView
3
  from PyQt6.QtWidgets import QMessageBox
4
  from PyQt6.QtCore import QTimer
 
5
 
6
  class FindTargetsController:
7
  def __init__(self, global_settings):
8
  self.global_settings = global_settings
9
- self.model = None
10
- self.view = None
11
  self.organism = None
12
  self.endonuclease = None
13
  self._input_data = None
@@ -15,17 +16,23 @@ class FindTargetsController:
15
 
16
  # Connect to annotation file changes
17
  self.global_settings.annotation_file_changed.connect(self._on_annotation_file_changed)
 
18
 
19
  def _on_annotation_file_changed(self, new_annotation_file):
20
- """Handle annotation file changes by reprocessing data"""
21
  try:
22
  self.global_settings.logger.debug(f"FindTargetsController received new annotation file: {new_annotation_file}")
23
  self._current_annotation_file = new_annotation_file
24
 
25
- # Clear existing view and model
26
- self.view = None
27
- self.model = None
28
-
 
 
 
 
 
29
  except Exception as e:
30
  self.global_settings.logger.error(f"Error handling annotation file change: {str(e)}")
31
 
@@ -35,31 +42,27 @@ class FindTargetsController:
35
  self.view.push_button_view_targets.clicked.connect(self.view_targets)
36
 
37
  def find_targets(self, input_data):
38
- """Initialize view and process input data"""
39
  try:
 
 
40
  # Get current annotation file
41
  current_annotation = self.global_settings.get_current_annotation_file()
42
  input_data['annotation_file'] = current_annotation
43
  self._current_annotation_file = current_annotation
 
44
 
45
- # Always create new instances
46
- self.model = FindTargetsModel(self.global_settings)
47
- self.view = FindTargetsView(self.global_settings)
48
- self._connect_signals()
49
-
50
- self._input_data = input_data
51
 
52
- # Find existing Find Targets tab
53
  main_window = self.global_settings.main_window
54
  existing_tab = main_window.find_tab_by_title("Find Targets")
 
 
55
 
56
- if existing_tab:
57
- # Remove the existing tab
58
- tab_index = main_window.view.tab_widget.indexOf(existing_tab)
59
- main_window.view.tab_widget.removeTab(tab_index)
60
-
61
- # Process data and create new tab
62
- self._process_input_data(input_data)
63
 
64
  except Exception as e:
65
  self.global_settings.logger.error(f"Error in find_targets: {str(e)}")
@@ -68,24 +71,28 @@ class FindTargetsController:
68
  def _process_input_data(self, input_data):
69
  """Process input data and update view"""
70
  try:
71
- if not self.view:
72
- return
73
-
74
  self.global_settings.logger.debug(f"FindTargetsController processing input data: {input_data}")
75
  self.organism = input_data['organism']
76
  self.endonuclease = input_data['endonuclease']
77
 
78
  # Get new results
 
79
  results = self.model.find_targets(input_data)
 
 
80
  self.global_settings.logger.debug(f"Found {len(results) if results else 0} targets")
81
 
82
  # Update view with new results
 
83
  if results:
84
  self.view.display_results(results)
 
 
85
 
86
- # Add new tab with updated view
87
- main_window = self.global_settings.main_window
88
- main_window.open_new_tab("Find Targets", self)
89
 
90
  except Exception as e:
91
  self.global_settings.logger.error(f"Error processing input data: {str(e)}")
@@ -93,12 +100,14 @@ class FindTargetsController:
93
  QMessageBox.critical(self.view, "Error", f"An error occurred while processing data: {str(e)}")
94
 
95
  def view_targets(self):
96
- """Handle view targets button click"""
97
  try:
98
  if not self.view:
99
  return
100
 
101
  selected_targets = self.view.get_selected_targets()
 
 
 
102
  if not selected_targets:
103
  QMessageBox.warning(self.view, "No Selection", "Please select targets to view.")
104
  return
 
2
  from views.FindTargetsView import FindTargetsView
3
  from PyQt6.QtWidgets import QMessageBox
4
  from PyQt6.QtCore import QTimer
5
+ import time
6
 
7
  class FindTargetsController:
8
  def __init__(self, global_settings):
9
  self.global_settings = global_settings
10
+ self.model = FindTargetsModel(self.global_settings)
11
+ self.view = FindTargetsView(self.global_settings)
12
  self.organism = None
13
  self.endonuclease = None
14
  self._input_data = None
 
16
 
17
  # Connect to annotation file changes
18
  self.global_settings.annotation_file_changed.connect(self._on_annotation_file_changed)
19
+ self._connect_signals()
20
 
21
  def _on_annotation_file_changed(self, new_annotation_file):
22
+ """Handle annotation file changes by clearing and updating results"""
23
  try:
24
  self.global_settings.logger.debug(f"FindTargetsController received new annotation file: {new_annotation_file}")
25
  self._current_annotation_file = new_annotation_file
26
 
27
+ # Clear the current results
28
+ if self.view and hasattr(self.view, 'results_table'):
29
+ self.view.clear_results()
30
+
31
+ # If we have previous input data, rerun the search with the new annotation file
32
+ if self._input_data:
33
+ self._input_data['annotation_file'] = new_annotation_file
34
+ self._process_input_data(self._input_data)
35
+
36
  except Exception as e:
37
  self.global_settings.logger.error(f"Error handling annotation file change: {str(e)}")
38
 
 
42
  self.view.push_button_view_targets.clicked.connect(self.view_targets)
43
 
44
  def find_targets(self, input_data):
45
+ """Process input data and update existing view or create new one"""
46
  try:
47
+ start_time = time.time()
48
+
49
  # Get current annotation file
50
  current_annotation = self.global_settings.get_current_annotation_file()
51
  input_data['annotation_file'] = current_annotation
52
  self._current_annotation_file = current_annotation
53
+ self._input_data = input_data.copy() # Store a copy of the input data
54
 
55
+ # Process data and update view
56
+ self._process_input_data(input_data)
 
 
 
 
57
 
58
+ # If there's no existing tab, create one
59
  main_window = self.global_settings.main_window
60
  existing_tab = main_window.find_tab_by_title("Find Targets")
61
+ if not existing_tab:
62
+ main_window.open_new_tab("Find Targets", self)
63
 
64
+ total_time = time.time() - start_time
65
+ self.global_settings.logger.debug(f"Total time to process find targets: {total_time:.2f} seconds")
 
 
 
 
 
66
 
67
  except Exception as e:
68
  self.global_settings.logger.error(f"Error in find_targets: {str(e)}")
 
71
  def _process_input_data(self, input_data):
72
  """Process input data and update view"""
73
  try:
74
+ start_time = time.time()
75
+
 
76
  self.global_settings.logger.debug(f"FindTargetsController processing input data: {input_data}")
77
  self.organism = input_data['organism']
78
  self.endonuclease = input_data['endonuclease']
79
 
80
  # Get new results
81
+ search_start = time.time()
82
  results = self.model.find_targets(input_data)
83
+ search_time = time.time() - search_start
84
+ self.global_settings.logger.debug(f"Time to search: {search_time:.2f} seconds")
85
  self.global_settings.logger.debug(f"Found {len(results) if results else 0} targets")
86
 
87
  # Update view with new results
88
+ view_start = time.time()
89
  if results:
90
  self.view.display_results(results)
91
+ view_time = time.time() - view_start
92
+ self.global_settings.logger.debug(f"Time to update view: {view_time:.2f} seconds")
93
 
94
+ total_time = time.time() - start_time
95
+ self.global_settings.logger.debug(f"Total time to process data: {total_time:.2f} seconds")
 
96
 
97
  except Exception as e:
98
  self.global_settings.logger.error(f"Error processing input data: {str(e)}")
 
100
  QMessageBox.critical(self.view, "Error", f"An error occurred while processing data: {str(e)}")
101
 
102
  def view_targets(self):
 
103
  try:
104
  if not self.view:
105
  return
106
 
107
  selected_targets = self.view.get_selected_targets()
108
+ print(f"Selected targets: {selected_targets}")
109
+ print(f"Organism: {self.organism}")
110
+ print(f"Endonuclease: {self.endonuclease}")
111
  if not selected_targets:
112
  QMessageBox.warning(self.view, "No Selection", "Please select targets to view.")
113
  return
src/controllers/HomeWindowController.py CHANGED
@@ -3,7 +3,7 @@ from PyQt6 import QtWidgets, QtCore, uic
3
  from PyQt6.QtWidgets import QMainWindow
4
  from views.HomeWindowView import HomeWindowView
5
  from models.HomeWindowModel import HomeWindowModel
6
- from utils.ui import show_error, show_message, scale_ui, center_ui, position_window
7
  from PyQt6.QtCore import QObject
8
  from controllers.FindTargetsController import FindTargetsController
9
 
 
3
  from PyQt6.QtWidgets import QMainWindow
4
  from views.HomeWindowView import HomeWindowView
5
  from models.HomeWindowModel import HomeWindowModel
6
+ from utils.ui import show_error, show_message
7
  from PyQt6.QtCore import QObject
8
  from controllers.FindTargetsController import FindTargetsController
9
 
src/controllers/MainWindowController.py CHANGED
@@ -1,83 +1,77 @@
1
- import os
2
- from PyQt6 import QtWidgets, QtCore, uic, QtGui
3
- from PyQt6.QtWidgets import QMainWindow, QWidget, QVBoxLayout, QHBoxLayout
4
  from views.MainWindowView import MainWindowView
5
  from models.MainWindowModel import MainWindowModel
6
- from controllers.MultitargetingWindowController import MultitargetingWindowController
7
- from utils.ui import show_error, show_message, scale_ui, center_ui, position_window
8
  from utils.web import ncbi_page, repo_page, ncbi_blast_page
9
- from PyQt6.QtCore import QObject, Qt
10
- import qdarktheme
11
  from PyQt6.QtCore import QSize
 
12
 
13
- class MainWindowController:
14
  def __init__(self, global_settings):
 
15
  self.global_settings = global_settings
16
- self.logger = global_settings.get_logger()
17
- self.tab_widgets = {} # Store references to tab widgets
 
 
18
  self.startup_controller = None
19
  self.is_first_time_startup = self.global_settings.is_first_time_startup
20
-
21
- # Single shared size for all regular tabs
22
  self.shared_tab_size = QSize(850, 850)
23
- # Separate size only for startup
24
  self.startup_size = QSize(750, 550)
25
-
26
  self.current_tab = None
27
 
28
  try:
29
  self.view = MainWindowView(global_settings)
30
  self._setup_connections()
31
  self._init_ui()
32
-
33
- # Check and emit first_time_startup signal after initialization
34
  self.global_settings.check_and_emit_first_time_startup()
35
  except Exception as e:
 
36
  show_error(self.global_settings, "Error initializing MainWindowController", str(e))
37
 
38
  def _setup_connections(self):
39
- try:
40
- # menuBar
41
- self.view.action_change_database_directory.triggered.connect(self._change_database_directory)
42
- # self.view.action_open_genome_browser.triggered.connect(self.open_genome_browser)
43
- self.view.action_open_repository.triggered.connect(self._open_repository_website)
44
- self.view.action_open_NCBI_BLAST.triggered.connect(self._open_ncbi_blast_website)
45
- self.view.action_open_NCBI.triggered.connect(self._open_ncbi_website)
46
-
47
- # Title Bar
48
- self.view.close_window_button.clicked.connect(self._close_window)
49
- self.view.minimize_window_button.clicked.connect(self._minimize_window)
50
- self.view.maximize_window_button.clicked.connect(self._maximize_window)
51
- self.view.theme_toggle_button.clicked.connect(self._toggle_theme)
52
 
53
- # Tab bar
54
- self.view.tab_widget.tab_closed.connect(self._on_tab_closed)
55
- self.view.tab_widget.tabCloseRequested.connect(self._close_tab)
 
 
56
 
57
- self.global_settings.first_time_startup.connect(self._handle_first_time_startup)
 
 
58
 
59
- except Exception as e:
60
- show_error(self.global_settings, "Error setting up connections in MainWindowController", str(e))
61
 
62
  def _init_ui(self):
63
- try:
64
- if self.is_first_time_startup:
65
- self.logger.info("First time startup detected in _init_ui. Opening startup tab.")
66
- self._open_startup_tab()
67
- else:
68
- db_path = self.global_settings.get_db_path()
69
- is_valid, message = self.global_settings.validate_db_path(db_path)
70
- if db_path and is_valid:
71
- self.logger.info(f"Database path is set and valid: {db_path}")
72
- self._open_home_tab()
73
- else:
74
- self.logger.info(f"Database path is not set or invalid: {db_path}. {message}")
75
- self._open_startup_tab()
76
- except Exception as e:
77
- show_error(self.global_settings, "Error initializing UI in MainWindowController", str(e))
 
78
 
79
  def _handle_first_time_startup(self):
80
- self.logger.info("First time startup signal received. Opening startup tab.")
81
  self.is_first_time_startup = True
82
  self._open_startup_tab()
83
 
@@ -86,30 +80,25 @@ class MainWindowController:
86
  self.startup_controller = self.global_settings.get_startup_window()
87
  self.open_new_tab("Startup", self.startup_controller)
88
  except Exception as e:
 
89
  show_error(self.global_settings, "Error opening startup tab", str(e))
90
 
91
  def _switch_to_home_from_startup(self):
92
- try:
93
- self.logger.debug("Switching to home from startup")
94
- # Find the startup tab
95
- startup_tab = self.find_tab_by_title("Startup")
96
- if startup_tab:
97
- index = self.view.tab_widget.indexOf(startup_tab)
98
- self._close_tab(index)
99
- self.logger.debug(f"Closed startup tab at index {index}")
100
-
101
- # Deactivate the startup controller
102
- if self.startup_controller:
103
- self.startup_controller.deactivate()
104
- self.startup_controller = None
105
- else:
106
- self.logger.warning("Startup tab not found when trying to close it")
107
 
108
- self.close_new_genome_and_switch_to_home()
109
- self._center_window() # Center the window after switching to home
110
- except Exception as e:
111
- self.logger.error(f"Error switching to home from startup: {str(e)}", exc_info=True)
112
- show_error(self.global_settings, "Error switching to home tab", str(e))
113
 
114
  def _center_window(self):
115
  try:
@@ -117,43 +106,32 @@ class MainWindowController:
117
  frame_geometry = self.view.frameGeometry()
118
  frame_geometry.moveCenter(center_point)
119
  self.view.move(frame_geometry.topLeft())
120
- self.logger.debug(f"Centered window. New position: {self.view.pos()}")
121
  except Exception as e:
122
- self.logger.error(f"Error centering window: {str(e)}", exc_info=True)
123
  show_error(self.global_settings, "Error centering window", str(e))
124
 
125
- def _open_home_tab(self):
126
- try:
127
- home_controller = self.global_settings.get_home_window()
128
- self.open_new_tab("Home", home_controller)
129
- except Exception as e:
130
- show_error(self.global_settings, "Error opening home tab", str(e))
131
-
132
  def _change_database_directory(self):
133
  try:
134
  new_directory = QtWidgets.QFileDialog.getExistingDirectory(
135
- self.view, "Select Database Directory", self.global_settings.get_db_path(),
 
136
  QtWidgets.QFileDialog.Option.ShowDirsOnly
137
  )
138
- if new_directory:
139
- is_valid, message = self.global_settings.validate_db_path(new_directory)
140
- if is_valid:
141
- self._process_valid_directory(new_directory)
142
- else:
143
- self._handle_invalid_directory(new_directory, message)
 
 
 
 
144
  except Exception as e:
145
- self.logger.error(f"Error changing database directory: {str(e)}", exc_info=True)
146
  show_error(self.global_settings, "Error changing database directory", str(e))
147
 
148
- def _process_valid_directory(self, new_directory):
149
- self.global_settings.save_db_path(new_directory)
150
- self.global_settings.update_db_state()
151
- show_message("Success", "Database directory changed successfully.")
152
-
153
- # If we're currently on the startup tab, switch to the home tab
154
- if self.startup_controller and self.view.tab_widget.currentWidget() == self.startup_controller.view:
155
- self._switch_to_home_from_startup()
156
-
157
  def _handle_invalid_directory(self, new_directory, message):
158
  reply = QtWidgets.QMessageBox.question(
159
  self.view,
@@ -171,6 +149,19 @@ class MainWindowController:
171
  else:
172
  show_message("Operation Cancelled", "Database directory change cancelled.")
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  def _open_ncbi_website(self):
175
  ncbi_page()
176
 
@@ -194,53 +185,69 @@ class MainWindowController:
194
 
195
  def _on_tab_closed(self, widget):
196
  """
197
- Handle the tab_closed signal by removing references to the deleted widget.
198
  """
199
- # Iterate through the tab_widgets to find and remove the closed widget
200
- for title, tab_widget in list(self.tab_widgets.items()):
201
- if tab_widget == widget:
202
- self.logger.info(f"Tab '{title}' closed. Dereferencing the widget.")
203
- del self.tab_widgets[title]
204
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  def open_new_tab(self, title, content):
 
207
  try:
208
- self.logger.debug(f"Attempting to open new tab: {title}")
209
 
210
  # Check if the tab already exists
211
  existing_tab = self.find_tab_by_title(title)
212
  if existing_tab:
213
- self.logger.debug(f"Tab '{title}' already exists, switching to it")
214
  self.view.tab_widget.setCurrentWidget(existing_tab)
215
  self._resize_for_tab(title)
216
  return
217
 
218
- # If the tab doesn't exist, create a new one
219
  if hasattr(content, 'view'):
220
  widget = content.view
 
 
221
  else:
222
  widget = content
223
 
224
- # Create a wrapper widget with padding
225
  wrapper = QWidget()
226
  layout = QVBoxLayout(wrapper)
227
  layout.setContentsMargins(10, 10, 10, 10)
228
  layout.addWidget(widget)
229
 
230
- # Add the wrapper to the tab widget
231
  index = self.view.tab_widget.addTab(wrapper, title)
232
  self.view.tab_widget.setCurrentIndex(index)
233
- self.tab_widgets[title] = wrapper
234
 
235
  self._resize_for_tab(title)
236
-
237
- self.logger.info(f"Opened new tab '{title}' at index {index}")
238
  except Exception as e:
239
- self.logger.error(f"Error opening tab '{title}': {str(e)}", exc_info=True)
240
  show_error(self.global_settings, f"Error opening tab '{title}'", str(e))
241
 
242
- self.view.tab_widget.currentChanged.connect(self._on_tab_changed)
243
-
244
  def _resize_for_tab(self, title):
245
  if title == "Startup":
246
  # For Startup tab, set fixed size and disable maximize button
@@ -251,7 +258,10 @@ class MainWindowController:
251
  self.view.setMinimumSize(QSize(400, 300))
252
  self.view.setMaximumSize(QtCore.QSize(16777215, 16777215))
253
  self.view.setWindowFlags(self.view.windowFlags() | Qt.WindowType.WindowMaximizeButtonHint)
254
- self.view.resize(self.shared_tab_size)
 
 
 
255
 
256
  # Ensure window flags are updated
257
  self.view.show()
@@ -260,30 +270,35 @@ class MainWindowController:
260
  self.current_tab = title
261
 
262
  def _close_tab(self, index):
 
 
 
263
  if 0 <= index < self.view.tab_widget.count():
264
- widget = self.view.tab_widget.widget(index)
265
  title = self.view.tab_widget.tabText(index)
266
- self.view.tab_widget.removeTab(index)
267
- if widget:
268
- widget.deleteLater()
269
- # Remove the tab from our tab_widgets dictionary
270
- if title in self.tab_widgets:
271
- del self.tab_widgets[title]
 
 
 
 
272
  self.logger.debug(f"Closed tab '{title}' at index {index}")
273
 
274
- # If we're closing the New Genome tab and Home tab exists, refresh it
275
  if title == "New Genome":
276
  home_tab = self.find_tab_by_title("Home")
277
  if home_tab:
278
  home_controller = self.global_settings.get_home_window()
279
  home_controller.refresh_data()
280
- else:
281
- self.logger.warning(f"Attempted to close non-existent tab at index {index}")
282
 
283
- if self.view.tab_widget.count() > 0:
284
- new_index = self.view.tab_widget.currentIndex()
285
- new_tab_title = self.view.tab_widget.tabText(new_index)
286
- self._resize_for_tab(new_tab_title)
 
287
 
288
  def _toggle_theme(self):
289
  try:
@@ -299,7 +314,8 @@ class MainWindowController:
299
  if saved_position:
300
  self.view.move(saved_position)
301
  else:
302
- center_ui(self.view)
 
303
  self.view.show()
304
  self.view.apply_theme()
305
  except Exception as e:
@@ -332,22 +348,8 @@ class MainWindowController:
332
  self.logger.debug(f"Window geometry after opening New Genome tab: {self.view.geometry()}")
333
 
334
  def find_tab_by_title(self, title):
335
- for i in range(self.view.tab_widget.count()):
336
- if self.view.tab_widget.tabText(i) == title:
337
- return self.view.tab_widget.widget(i)
338
- return None
339
-
340
- def _on_tab_changed(self, index):
341
- # Save the current size before switching if it's not the startup tab
342
- if self.current_tab and self.current_tab != "Startup":
343
- current_size = self.view.size()
344
- if current_size.width() >= 400 and current_size.height() >= 300:
345
- # Update shared size for all non-startup tabs
346
- self.shared_tab_size = current_size
347
-
348
- # Get the new tab title and resize
349
- new_tab_title = self.view.tab_widget.tabText(index)
350
- self._resize_for_tab(new_tab_title)
351
 
352
  def close_new_genome_and_switch_to_home(self):
353
  try:
 
1
+ from PyQt6 import QtWidgets, QtCore, QtGui
2
+ from PyQt6.QtWidgets import QWidget, QVBoxLayout
 
3
  from views.MainWindowView import MainWindowView
4
  from models.MainWindowModel import MainWindowModel
5
+ from utils.ui import show_error, show_message
 
6
  from utils.web import ncbi_page, repo_page, ncbi_blast_page
7
+ from PyQt6.QtCore import Qt
 
8
  from PyQt6.QtCore import QSize
9
+ from utils.LoggingMixin import LoggingMixin
10
 
11
+ class MainWindowController(LoggingMixin):
12
  def __init__(self, global_settings):
13
+ LoggingMixin.__init__(self)
14
  self.global_settings = global_settings
15
+ self.tab_widgets = {
16
+ 'widgets': {},
17
+ 'controllers': {}
18
+ }
19
  self.startup_controller = None
20
  self.is_first_time_startup = self.global_settings.is_first_time_startup
 
 
21
  self.shared_tab_size = QSize(850, 850)
 
22
  self.startup_size = QSize(750, 550)
 
23
  self.current_tab = None
24
 
25
  try:
26
  self.view = MainWindowView(global_settings)
27
  self._setup_connections()
28
  self._init_ui()
 
 
29
  self.global_settings.check_and_emit_first_time_startup()
30
  except Exception as e:
31
+ self.log_error("__init__", e)
32
  show_error(self.global_settings, "Error initializing MainWindowController", str(e))
33
 
34
  def _setup_connections(self):
35
+ self.log_method_call("_setup_connections")
36
+
37
+ # menuBar
38
+ self.view.action_change_database_directory.triggered.connect(self._change_database_directory)
39
+ self.view.action_open_repository.triggered.connect(self._open_repository_website)
40
+ self.view.action_open_NCBI_BLAST.triggered.connect(self._open_ncbi_blast_website)
41
+ self.view.action_open_NCBI.triggered.connect(self._open_ncbi_website)
 
 
 
 
 
 
42
 
43
+ # Title Bar
44
+ self.view.close_window_button.clicked.connect(self._close_window)
45
+ self.view.minimize_window_button.clicked.connect(self._minimize_window)
46
+ self.view.maximize_window_button.clicked.connect(self._maximize_window)
47
+ self.view.theme_toggle_button.clicked.connect(self._toggle_theme)
48
 
49
+ # Tab bar
50
+ self.view.tab_widget.tab_closed.connect(self._on_tab_closed)
51
+ self.view.tab_widget.tabCloseRequested.connect(self._close_tab)
52
 
53
+ self.global_settings.first_time_startup.connect(self._handle_first_time_startup)
 
54
 
55
  def _init_ui(self):
56
+ self.log_method_call("_init_ui")
57
+
58
+ if self.is_first_time_startup:
59
+ self.log_info("First time startup detected. Opening startup tab.")
60
+ self._open_startup_tab()
61
+ return
62
+
63
+ db_path = self.global_settings.get_db_path()
64
+ is_valid, message = self.global_settings.validate_db_path(db_path)
65
+
66
+ if db_path and is_valid:
67
+ self.log_info(f"Database path is valid: {db_path}")
68
+ self._open_home_tab()
69
+ else:
70
+ self.log_warning(f"Invalid database path: {db_path}. {message}")
71
+ self._open_startup_tab()
72
 
73
  def _handle_first_time_startup(self):
74
+ self.log_info("First time startup signal received")
75
  self.is_first_time_startup = True
76
  self._open_startup_tab()
77
 
 
80
  self.startup_controller = self.global_settings.get_startup_window()
81
  self.open_new_tab("Startup", self.startup_controller)
82
  except Exception as e:
83
+ self.log_error("_open_startup_tab", e)
84
  show_error(self.global_settings, "Error opening startup tab", str(e))
85
 
86
  def _switch_to_home_from_startup(self):
87
+ self.log_method_call("_switch_to_home_from_startup")
88
+
89
+ startup_tab = self.find_tab_by_title("Startup")
90
+ if startup_tab:
91
+ index = self.view.tab_widget.indexOf(startup_tab)
92
+ self._close_tab(index)
93
+
94
+ if self.startup_controller:
95
+ self.startup_controller.deactivate()
96
+ self.startup_controller = None
97
+ else:
98
+ self.log_warning("Startup tab not found when trying to close it")
 
 
 
99
 
100
+ self.close_new_genome_and_switch_to_home()
101
+ self._center_window()
 
 
 
102
 
103
  def _center_window(self):
104
  try:
 
106
  frame_geometry = self.view.frameGeometry()
107
  frame_geometry.moveCenter(center_point)
108
  self.view.move(frame_geometry.topLeft())
109
+ self.log_debug(f"Window centered at {self.view.pos()}")
110
  except Exception as e:
111
+ self.log_error("_center_window", e)
112
  show_error(self.global_settings, "Error centering window", str(e))
113
 
 
 
 
 
 
 
 
114
  def _change_database_directory(self):
115
  try:
116
  new_directory = QtWidgets.QFileDialog.getExistingDirectory(
117
+ self.view, "Select Database Directory",
118
+ self.global_settings.get_db_path(),
119
  QtWidgets.QFileDialog.Option.ShowDirsOnly
120
  )
121
+
122
+ if not new_directory:
123
+ return
124
+
125
+ is_valid, message = self.global_settings.validate_db_path(new_directory)
126
+ if is_valid:
127
+ self._process_valid_directory(new_directory)
128
+ else:
129
+ self._handle_invalid_directory(new_directory, message)
130
+
131
  except Exception as e:
132
+ self.log_error("_change_database_directory", e)
133
  show_error(self.global_settings, "Error changing database directory", str(e))
134
 
 
 
 
 
 
 
 
 
 
135
  def _handle_invalid_directory(self, new_directory, message):
136
  reply = QtWidgets.QMessageBox.question(
137
  self.view,
 
149
  else:
150
  show_message("Operation Cancelled", "Database directory change cancelled.")
151
 
152
+ def _process_valid_directory(self, new_directory):
153
+ try:
154
+ self.global_settings.save_db_path(new_directory)
155
+ self.global_settings.update_db_state()
156
+ show_message("Success", "Database directory changed successfully.")
157
+
158
+ if (self.startup_controller and
159
+ self.view.tab_widget.currentWidget() == self.startup_controller.view):
160
+ self._switch_to_home_from_startup()
161
+ except Exception as e:
162
+ self.log_error("_process_valid_directory", e)
163
+ show_error(self.global_settings, "Error processing directory", str(e))
164
+
165
  def _open_ncbi_website(self):
166
  ncbi_page()
167
 
 
185
 
186
  def _on_tab_closed(self, widget):
187
  """
188
+ Handle the tab_closed signal from CloseableTabWidget
189
  """
190
+ try:
191
+ # Remove references from both widgets and controllers dictionaries
192
+ for title in list(self.tab_widgets['widgets'].keys()):
193
+ if self.tab_widgets['widgets'][title] == widget:
194
+ self.logger.info(f"Tab '{title}' closed. Cleaning up references.")
195
+ del self.tab_widgets['widgets'][title]
196
+ if title in self.tab_widgets['controllers']:
197
+ del self.tab_widgets['controllers'][title]
198
+ break
199
+ except Exception as e:
200
+ self.logger.error(f"Error in _on_tab_closed: {str(e)}")
201
+
202
+ def _open_home_tab(self):
203
+ """Opens the home tab"""
204
+ try:
205
+ home_controller = self.global_settings.get_home_window()
206
+ self.open_new_tab("Home", home_controller)
207
+ self.log_info("Home tab opened successfully")
208
+ except Exception as e:
209
+ self.log_error("_open_home_tab", e)
210
+ show_error(self.global_settings, "Error opening home tab", str(e))
211
 
212
  def open_new_tab(self, title, content):
213
+ """Opens a new tab with the given title and content"""
214
  try:
215
+ self.log_debug(f"Opening new tab: {title}")
216
 
217
  # Check if the tab already exists
218
  existing_tab = self.find_tab_by_title(title)
219
  if existing_tab:
220
+ self.log_debug(f"Tab '{title}' already exists, switching to it")
221
  self.view.tab_widget.setCurrentWidget(existing_tab)
222
  self._resize_for_tab(title)
223
  return
224
 
225
+ # Create widget from content
226
  if hasattr(content, 'view'):
227
  widget = content.view
228
+ # Store controller reference
229
+ self.tab_widgets['controllers'][title] = content
230
  else:
231
  widget = content
232
 
233
+ # Create wrapper widget with padding
234
  wrapper = QWidget()
235
  layout = QVBoxLayout(wrapper)
236
  layout.setContentsMargins(10, 10, 10, 10)
237
  layout.addWidget(widget)
238
 
239
+ # Add the wrapper to the tab widget and store reference
240
  index = self.view.tab_widget.addTab(wrapper, title)
241
  self.view.tab_widget.setCurrentIndex(index)
242
+ self.tab_widgets['widgets'][title] = wrapper
243
 
244
  self._resize_for_tab(title)
245
+ self.log_info(f"Tab '{title}' opened successfully at index {index}")
246
+
247
  except Exception as e:
248
+ self.log_error("open_new_tab", e)
249
  show_error(self.global_settings, f"Error opening tab '{title}'", str(e))
250
 
 
 
251
  def _resize_for_tab(self, title):
252
  if title == "Startup":
253
  # For Startup tab, set fixed size and disable maximize button
 
258
  self.view.setMinimumSize(QSize(400, 300))
259
  self.view.setMaximumSize(QtCore.QSize(16777215, 16777215))
260
  self.view.setWindowFlags(self.view.windowFlags() | Qt.WindowType.WindowMaximizeButtonHint)
261
+
262
+ # Only resize if coming from Startup tab or if no current size is set
263
+ if self.current_tab == "Startup" or self.view.size() == self.startup_size:
264
+ self.view.resize(self.shared_tab_size)
265
 
266
  # Ensure window flags are updated
267
  self.view.show()
 
270
  self.current_tab = title
271
 
272
  def _close_tab(self, index):
273
+ """
274
+ Handle tab closure using CloseableTabWidget
275
+ """
276
  if 0 <= index < self.view.tab_widget.count():
 
277
  title = self.view.tab_widget.tabText(index)
278
+
279
+ # Let CloseableTabWidget handle the widget cleanup
280
+ self.view.tab_widget.closeTab(index)
281
+
282
+ # Clean up our references
283
+ if title in self.tab_widgets['widgets']:
284
+ del self.tab_widgets['widgets'][title]
285
+ if title in self.tab_widgets['controllers']:
286
+ del self.tab_widgets['controllers'][title]
287
+
288
  self.logger.debug(f"Closed tab '{title}' at index {index}")
289
 
290
+ # Handle post-close operations
291
  if title == "New Genome":
292
  home_tab = self.find_tab_by_title("Home")
293
  if home_tab:
294
  home_controller = self.global_settings.get_home_window()
295
  home_controller.refresh_data()
 
 
296
 
297
+ # Resize for the current tab
298
+ if self.view.tab_widget.count() > 0:
299
+ new_index = self.view.tab_widget.currentIndex()
300
+ new_tab_title = self.view.tab_widget.tabText(new_index)
301
+ self._resize_for_tab(new_tab_title)
302
 
303
  def _toggle_theme(self):
304
  try:
 
314
  if saved_position:
315
  self.view.move(saved_position)
316
  else:
317
+ # center_ui(self.view)
318
+ pass
319
  self.view.show()
320
  self.view.apply_theme()
321
  except Exception as e:
 
348
  self.logger.debug(f"Window geometry after opening New Genome tab: {self.view.geometry()}")
349
 
350
  def find_tab_by_title(self, title):
351
+ """Find a tab by its title"""
352
+ return self.tab_widgets['widgets'].get(title)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
  def close_new_genome_and_switch_to_home(self):
355
  try:
src/controllers/PopulationAnalysisWindowController.py CHANGED
@@ -1,5 +1,5 @@
1
  from PyQt6 import QtWidgets
2
- from utils.ui import show_error, show_message, position_window
3
  from views.PopulationAnalysisWindowView import PopulationAnalysisWindowView
4
  from models.PopulationAnalysisWindowModel import PopulationAnalysisWindowModel
5
  import logging
 
1
  from PyQt6 import QtWidgets
2
+ from utils.ui import show_error, show_message
3
  from views.PopulationAnalysisWindowView import PopulationAnalysisWindowView
4
  from models.PopulationAnalysisWindowModel import PopulationAnalysisWindowModel
5
  import logging
src/controllers/ScoringOptionsController.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models.ScoringOptionsModel import ScoringOptionsModel
2
+ from views.ScoringOptionsView import ScoringOptionsView
3
+
4
class ScoringOptionsController:
    """Controller for the scoring-options window.

    Creates a ``ScoringOptionsModel`` and ``ScoringOptionsView``, listens to the
    view's ``fasta_selected`` and ``submit_clicked`` signals, and hands computed
    scores back to the owning View Targets controller via ``update_scores``.
    """

    def __init__(self, global_settings, view_targets_controller):
        """Build the model and view and wire the view's signals.

        Args:
            global_settings: Shared settings object; its ``logger`` attribute is
                used for diagnostics.
            view_targets_controller: Controller whose view supplies the current
                gene and selected targets and which receives the scores.
        """
        self.global_settings = global_settings
        self.view_targets_controller = view_targets_controller
        self.model = ScoringOptionsModel(global_settings)
        self.view = ScoringOptionsView(global_settings)

        # Connect signals
        self.view.fasta_selected.connect(self._on_fasta_selected)
        self.view.submit_clicked.connect(self._on_submit)

    def show(self):
        """Show the scoring options window."""
        self.view.show()

    def _on_fasta_selected(self, fasta_path):
        """Load the chosen FASTA file for the chromosome of the current gene.

        The current gene is read from the View Targets combo box; the text
        before ``': '`` is treated as the locus tag.  Problems are reported
        through ``self.view.show_error`` and logged; nothing is raised, which
        mirrors ``_on_submit`` so a failure cannot escape the signal slot.
        """
        try:
            # Resolve the gene currently shown in View Targets.
            current_gene = self.view_targets_controller.view.combo_box_gene.currentText()
            locus_tag = current_gene.split(': ')[0] if ': ' in current_gene else current_gene
            self.global_settings.logger.debug(f"Getting gene data for locus tag: {locus_tag}")
            gene_data = self.view_targets_controller.model.get_gene_data(locus_tag)

            # The chromosome is needed to pick the right FASTA record; treat a
            # missing entry the same as missing gene data.  ``is None`` is used
            # so a chromosome value of 0 would still be accepted.
            chromosome = None
            if gene_data and 'info' in gene_data:
                chromosome = gene_data['info'].get('chromosome')
            if chromosome is None:
                self.view.show_error("Error", "Could not get chromosome information for current gene")
                return

            # Load FASTA file
            if not self.model.load_fasta(fasta_path, chromosome):
                self.view.show_error("Error", "Failed to load FASTA file")

        except Exception as e:
            self.global_settings.logger.error(f"Error loading FASTA file: {str(e)}")
            self.view.show_error("Error", f"Error loading FASTA file: {str(e)}")

    def _on_submit(self):
        """Validate the form, score the selected targets and publish the result.

        Stops with an error dialog when no FASTA path, no algorithm, or no
        targets are selected, or when the model returns ``None`` scores.
        Sequences listed in the model's reject list are reported to the user.
        On success the scores are passed to ``update_scores`` and the window is
        closed.  Any exception is logged and shown, not re-raised.
        """
        try:
            # Validate inputs
            fasta_path = self.view.get_fasta_path()
            if not fasta_path:
                self.view.show_error("Error", "Please select a FASTA file")
                return

            algorithm = self.view.get_selected_algorithm()
            if not algorithm:
                self.view.show_error("Error", "Please select a scoring algorithm")
                return

            # Get selected targets from view targets
            selected_targets = self.view_targets_controller.view.get_selected_targets()
            if not selected_targets:
                self.view.show_error("Error", "No targets selected")
                return

            # Score sequences
            scores, reject_list, guide_list = self.model.score_sequences(selected_targets, algorithm)

            if scores is None:
                self.view.show_error("Error", "Failed to score sequences")
                return

            # Report rejected sequences; reject_list holds indices into guide_list.
            if reject_list:
                rejected_seqs = "\n".join(guide_list[i] for i in reject_list)
                self.view.show_info(
                    "Sequences Not Found",
                    f"The following sequences were not found and scored as -1:\n{rejected_seqs}"
                )

            # Update scores in view targets
            self.view_targets_controller.update_scores(scores, algorithm)

            self.view.close()

        except Exception as e:
            self.global_settings.logger.error(f"Error in scoring submission: {str(e)}")
            self.view.show_error("Error", f"Error processing scores: {str(e)}")
src/controllers/ViewTargetsController.py CHANGED
@@ -1,11 +1,14 @@
1
  import logging
 
2
  from models.ViewTargetsModel import ViewTargetsModel
3
  from views.ViewTargetsView import ViewTargetsView
4
  from PyQt6.QtWidgets import QMessageBox
5
  from utils.ui import show_error
6
  import time
 
7
  import traceback
8
  import threading
 
9
 
10
  class ViewTargetsController:
11
  def __init__(self, global_settings):
@@ -31,6 +34,7 @@ class ViewTargetsController:
31
  self.view.push_button_reset_location.clicked.connect(self.reset_location)
32
  self.view.check_box_select_all.stateChanged.connect(self.select_all)
33
  self.view.combo_box_gene.currentIndexChanged.connect(self.display_gene_data)
 
34
 
35
  def load_targets(self, selected_targets, organism, endonuclease):
36
  try:
@@ -51,6 +55,43 @@ class ViewTargetsController:
51
  targets_time = time.time() - targets_start
52
  self.global_settings.logger.debug(f"Getting targets took: {targets_time:.2f} seconds")
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  # Time displaying targets
55
  display_start = time.time()
56
  self.view.display_targets_in_table(targets)
@@ -76,44 +117,54 @@ class ViewTargetsController:
76
  show_error(self.global_settings, "Error loading targets", str(e))
77
 
78
  def load_gene_viewer(self):
 
79
  try:
80
- start_time = time.time()
81
-
82
- # Get available genes from the model
83
- genes = self.model.get_available_genes()
84
- if genes:
85
- # Update the gene combo box
86
- self.view.combo_box_gene.clear()
87
- self.view.combo_box_gene.addItems(genes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- # Fetch first gene immediately
90
- first_gene = genes[0]
91
- gene_data = self.model.get_gene_data(first_gene)
 
 
 
92
 
93
- if gene_data:
94
- # Update the gene viewer with sequence
95
- self.view.set_text_edit_gene_viewer(gene_data['sequence'])
96
-
97
- # Update location fields if available
98
- if 'info' in gene_data and 'feature_location' in gene_data['info']:
99
- location = gene_data['info']['feature_location']
100
- if ':' in location:
101
- start, end = location.split(':')[0], location.split(':')[1].split('(')[0]
102
- self.view.line_edit_start_location.setText(start)
103
- self.view.line_edit_stop_location.setText(end)
104
-
105
- # Pre-fetch next few genes in background thread
106
- def prefetch_genes():
107
- for gene in genes[1:5]: # Pre-fetch next 4 genes
108
- self.model.get_gene_data(gene)
109
-
110
- threading.Thread(target=prefetch_genes, daemon=True).start()
111
 
112
- execution_time = time.time() - start_time
113
- self.global_settings.logger.debug(f"Loading gene viewer took: {execution_time:.2f} seconds")
114
-
115
  except Exception as e:
116
- self.global_settings.logger.error(f"Error in load_gene_viewer: {str(e)}\n{traceback.format_exc()}")
 
117
 
118
  def perform_off_target_analysis(self):
119
  try:
@@ -142,6 +193,7 @@ class ViewTargetsController:
142
  show_error(self.global_settings, "Error in cotargeting", str(e))
143
 
144
  def highlight_gene_viewer(self):
 
145
  try:
146
  self.global_settings.logger.debug("Starting highlight_gene_viewer")
147
 
@@ -167,30 +219,23 @@ class ViewTargetsController:
167
 
168
  # Get current gene sequence
169
  current_gene = self.view.combo_box_gene.currentText()
170
- self.global_settings.logger.debug(f"Current gene: {current_gene}")
 
171
 
172
- gene_data = self.model.get_gene_data(current_gene)
173
- if not gene_data:
174
- self.global_settings.logger.error("No gene data found")
 
175
  QMessageBox.warning(self.view, "No Gene Data",
176
- "Could not get gene data for highlighting.")
177
  return
178
 
179
- self.global_settings.logger.debug(f"Gene sequence length: {len(gene_data['sequence'])}")
180
 
181
  # Highlight the sequences
182
  if targets_to_highlight:
183
  self.global_settings.logger.debug("Attempting to highlight sequences")
184
- highlighted_sequence = self.model.highlight_targets_in_gene_viewer(targets_to_highlight)
185
-
186
- if highlighted_sequence:
187
- self.global_settings.logger.debug("Successfully highlighted sequences")
188
- self.global_settings.logger.debug(f"Highlighted sequence length: {len(highlighted_sequence)}")
189
- self.view.update_gene_viewer(highlighted_sequence)
190
- else:
191
- self.global_settings.logger.error("Failed to highlight sequences - returned None")
192
- QMessageBox.warning(self.view, "Highlighting Failed",
193
- "Could not highlight the selected sequences. They may not be found in the current gene view.")
194
  else:
195
  self.global_settings.logger.error("No valid targets to highlight")
196
  QMessageBox.warning(self.view, "No Valid Targets",
@@ -225,15 +270,23 @@ class ViewTargetsController:
225
  show_error(self.global_settings, "Error showing filter options", str(e))
226
 
227
  def show_scoring_options(self):
 
228
  try:
229
- scoring_options = self.model.get_scoring_options()
230
- self.view.show_scoring_options_dialog(scoring_options)
231
- if self.view.scoring_options_accepted():
232
- new_options = self.view.get_scoring_options()
233
- self.model.set_scoring_options(new_options)
234
- self.refresh_targets_display()
 
 
 
 
 
235
  except Exception as e:
236
- show_error(self.global_settings, "Error showing scoring options", str(e))
 
 
237
 
238
  def change_indices(self):
239
  try:
@@ -287,3 +340,196 @@ class ViewTargetsController:
287
 
288
  def show(self):
289
  self.view.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import logging
2
+ from controllers.ScoringOptionsController import ScoringOptionsController
3
  from models.ViewTargetsModel import ViewTargetsModel
4
  from views.ViewTargetsView import ViewTargetsView
5
  from PyQt6.QtWidgets import QMessageBox
6
  from utils.ui import show_error
7
  import time
8
+ from PyQt6 import QtWidgets, QtCore
9
  import traceback
10
  import threading
11
+ from Bio.Seq import Seq
12
 
13
  class ViewTargetsController:
14
  def __init__(self, global_settings):
 
34
  self.view.push_button_reset_location.clicked.connect(self.reset_location)
35
  self.view.check_box_select_all.stateChanged.connect(self.select_all)
36
  self.view.combo_box_gene.currentIndexChanged.connect(self.display_gene_data)
37
+ self.view.gene_selected.connect(self.on_gene_selected)
38
 
39
  def load_targets(self, selected_targets, organism, endonuclease):
40
  try:
 
55
  targets_time = time.time() - targets_start
56
  self.global_settings.logger.debug(f"Getting targets took: {targets_time:.2f} seconds")
57
 
58
+ # Get feature ID mapping from FindTargetsModel - Optimized with timing
59
+ genes_start = time.time()
60
+
61
+ # Time set creation
62
+ set_start = time.time()
63
+ seen_genes = set()
64
+ formatted_genes = []
65
+ set_time = time.time() - set_start
66
+ self.global_settings.logger.debug(f"Set initialization took: {set_time:.2f} seconds")
67
+
68
+ # Time target processing
69
+ process_start = time.time()
70
+ for target in selected_targets:
71
+ gene_name = target.get('feature_name')
72
+ feature_id = target.get('feature_id')
73
+
74
+ if gene_name and feature_id and gene_name not in seen_genes:
75
+ seen_genes.add(gene_name)
76
+ formatted_genes.append(f"{feature_id}: {gene_name}")
77
+ process_time = time.time() - process_start
78
+ self.global_settings.logger.debug(f"Target processing took: {process_time:.2f} seconds")
79
+
80
+ # Time sorting
81
+ sort_start = time.time()
82
+ formatted_genes.sort()
83
+ sort_time = time.time() - sort_start
84
+ self.global_settings.logger.debug(f"Sorting took: {sort_time:.2f} seconds")
85
+
86
+ # Time view update
87
+ view_start = time.time()
88
+ self.view.set_combo_box_gene(formatted_genes)
89
+ view_time = time.time() - view_start
90
+ self.global_settings.logger.debug(f"View update took: {view_time:.2f} seconds")
91
+
92
+ genes_time = time.time() - genes_start
93
+ self.global_settings.logger.debug(f"Total setting genes took: {genes_time:.2f} seconds")
94
+
95
  # Time displaying targets
96
  display_start = time.time()
97
  self.view.display_targets_in_table(targets)
 
117
  show_error(self.global_settings, "Error loading targets", str(e))
118
 
119
  def load_gene_viewer(self):
120
+ """Load gene viewer with sequence and location information"""
121
  try:
122
+ total_start = time.time()
123
+
124
+ # Get selected gene from combo box
125
+ combo_start = time.time()
126
+ selected_text = self.view.combo_box_gene.currentText()
127
+ if not selected_text:
128
+ self.global_settings.logger.debug("No gene selected")
129
+ return
130
+ combo_time = time.time() - combo_start
131
+ self.global_settings.logger.debug(f"Combo box access time: {combo_time:.2f} seconds")
132
+
133
+ # Extract locus tag from "locus_tag: gene_name" format
134
+ parse_start = time.time()
135
+ locus_tag = selected_text.split(': ')[0] if ': ' in selected_text else selected_text
136
+ self.global_settings.logger.debug(f"Loading sequence for locus tag: {locus_tag}")
137
+ parse_time = time.time() - parse_start
138
+ self.global_settings.logger.debug(f"Locus tag parsing time: {parse_time:.2f} seconds")
139
+
140
+ # Get gene sequence with padding
141
+ sequence_start = time.time()
142
+ sequence_data = self.model.get_gene_sequence(locus_tag)
143
+ sequence_time = time.time() - sequence_start
144
+ self.global_settings.logger.debug(f"Sequence retrieval time: {sequence_time:.2f} seconds")
145
+
146
+ if sequence_data:
147
+ # Update gene viewer with sequence
148
+ viewer_start = time.time()
149
+ self.view.set_text_edit_gene_viewer(sequence_data['sequence'])
150
+ viewer_time = time.time() - viewer_start
151
+ self.global_settings.logger.debug(f"Text viewer update time: {viewer_time:.2f} seconds")
152
 
153
+ # Update location fields
154
+ location_start = time.time()
155
+ self.view.line_edit_start_location.setText(str(sequence_data['start']))
156
+ self.view.line_edit_stop_location.setText(str(sequence_data['end']))
157
+ location_time = time.time() - location_start
158
+ self.global_settings.logger.debug(f"Location fields update time: {location_time:.2f} seconds")
159
 
160
+ total_time = time.time() - total_start
161
+ self.global_settings.logger.debug(f"Total gene viewer loading took: {total_time:.2f} seconds")
162
+ else:
163
+ self.global_settings.logger.warning(f"No sequence data found for locus tag {locus_tag}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
 
 
 
165
  except Exception as e:
166
+ self.global_settings.logger.error(f"Error in load_gene_viewer: {str(e)}")
167
+ self.global_settings.logger.error(f"Stack trace: {traceback.format_exc()}")
168
 
169
  def perform_off_target_analysis(self):
170
  try:
 
193
  show_error(self.global_settings, "Error in cotargeting", str(e))
194
 
195
  def highlight_gene_viewer(self):
196
+ """Highlight selected targets in gene viewer"""
197
  try:
198
  self.global_settings.logger.debug("Starting highlight_gene_viewer")
199
 
 
219
 
220
  # Get current gene sequence
221
  current_gene = self.view.combo_box_gene.currentText()
222
+ locus_tag = current_gene.split(': ')[0] if ': ' in current_gene else current_gene
223
+ self.global_settings.logger.debug(f"Getting sequence for locus tag: {locus_tag}")
224
 
225
+ # Get gene sequence with padding
226
+ sequence_data = self.model.get_gene_sequence(locus_tag)
227
+ if not sequence_data or 'sequence' not in sequence_data:
228
+ self.global_settings.logger.error("No sequence data found")
229
  QMessageBox.warning(self.view, "No Gene Data",
230
+ "Could not get gene sequence for highlighting.")
231
  return
232
 
233
+ self.global_settings.logger.debug(f"Gene sequence length: {len(sequence_data['sequence'])}")
234
 
235
  # Highlight the sequences
236
  if targets_to_highlight:
237
  self.global_settings.logger.debug("Attempting to highlight sequences")
238
+ self.highlight_targets_in_gene_viewer(targets_to_highlight)
 
 
 
 
 
 
 
 
 
239
  else:
240
  self.global_settings.logger.error("No valid targets to highlight")
241
  QMessageBox.warning(self.view, "No Valid Targets",
 
270
  show_error(self.global_settings, "Error showing filter options", str(e))
271
 
272
  def show_scoring_options(self):
273
+ """Show scoring options window"""
274
  try:
275
+ # Create scoring options controller if not exists
276
+ if not hasattr(self, '_scoring_options_controller'):
277
+ # Create controller with self as view_targets_controller
278
+ self._scoring_options_controller = ScoringOptionsController(
279
+ global_settings=self.global_settings,
280
+ view_targets_controller=self
281
+ )
282
+
283
+ # Show scoring options window
284
+ self._scoring_options_controller.show()
285
+
286
  except Exception as e:
287
+ self.global_settings.logger.error(f"Error showing scoring options: {str(e)}")
288
+ self.global_settings.logger.error(f"Stack trace: {traceback.format_exc()}")
289
+ show_error(self.global_settings, "Error", f"Could not show scoring options: {str(e)}")
290
 
291
  def change_indices(self):
292
  try:
 
340
 
341
  def show(self):
342
  self.view.show()
343
+
344
def on_gene_selected(self, selected_text):
    """React to the view's ``gene_selected`` signal.

    ``selected_text`` is expected in ``"locus_tag: gene_name"`` form; the part
    before ``': '`` (or the whole text when there is no separator) is used to
    fetch sequence data from the model.  With data, the viewer and both
    location fields are filled; without it, a placeholder message is shown and
    the location fields are cleared.  Exceptions are logged, not re-raised.
    """
    try:
        log = self.global_settings.logger
        view = self.view

        # Keep only the locus tag.
        locus_tag = selected_text.partition(': ')[0]
        log.debug(f"Loading sequence for locus tag: {locus_tag}")

        sequence_data = self.model.get_gene_sequence(locus_tag)

        if not sequence_data:
            # Nothing to display: say so and blank the coordinates.
            log.warning(f"No sequence data found for locus tag {locus_tag}")
            view.set_text_edit_gene_viewer("No sequence data available for this gene")
            view.line_edit_start_location.clear()
            view.line_edit_stop_location.clear()
            return

        gene_sequence = sequence_data['sequence']
        view.set_text_edit_gene_viewer(gene_sequence)

        # Coordinates go into the line edits as text.
        view.line_edit_start_location.setText(str(sequence_data['start']))
        view.line_edit_stop_location.setText(str(sequence_data['end']))

        log.debug(f"Updated gene viewer with sequence of length: {len(gene_sequence)}")

    except Exception as e:
        self.global_settings.logger.error(f"Error handling gene selection: {str(e)}")
        self.global_settings.logger.error(f"Stack trace: {traceback.format_exc()}")
371
+
372
def highlight_targets_in_gene_viewer(self, targets_to_highlight=None):
    """Highlight target sequences inside the currently displayed gene.

    Args:
        targets_to_highlight: Iterable of dicts with ``'sequence'`` and
            ``'strand'`` keys.  When ``None``, the view's selected targets
            are used.

    For each target the first case-insensitive match in the gene sequence is
    wrapped in a ``<span>`` with a green background, or red for ``'-'`` strand
    targets, which are searched as their reverse complement.  Matches that
    overlap an earlier one are clipped so every base of the gene appears
    exactly once in the generated markup.  The markup is passed to
    ``self.view.update_gene_viewer``.

    A warning dialog is shown when no targets are given or none can be found.
    Exceptions are logged with a stack trace and not re-raised.
    """
    try:
        logger = self.global_settings.logger
        logger.debug("Starting highlight_gene_viewer")

        # Get selected targets if none provided
        if targets_to_highlight is None:
            targets_to_highlight = self.view.get_selected_targets()

        logger.debug(f"Selected targets: {targets_to_highlight}")

        if not targets_to_highlight:
            QMessageBox.warning(self.view, "No Selection",
                                "Please select targets to highlight in the gene viewer.")
            return

        # Get current gene sequence ("locus_tag: gene_name" -> locus_tag)
        selected_text = self.view.combo_box_gene.currentText()
        locus_tag = selected_text.split(': ')[0] if ': ' in selected_text else selected_text

        sequence_data = self.model.get_gene_sequence(locus_tag)
        if not sequence_data or 'sequence' not in sequence_data:
            logger.error("No sequence data available for highlighting")
            return

        sequence = sequence_data['sequence']
        # Upper-case once; the search below is case-insensitive.
        sequence_upper = sequence.upper()

        highlights = []
        total_sequences = len(targets_to_highlight)

        for target in targets_to_highlight:
            logger.debug(f"Processing target: {target}")
            sequence_to_find = target['sequence']
            strand = target['strand']

            # For negative strand, we need to use reverse complement
            if strand == '-':
                sequence_to_find = str(Seq(sequence_to_find).reverse_complement())
                logger.debug(f"Reverse complemented sequence: {sequence_to_find}")

            target_upper = sequence_to_find.upper()
            logger.debug(f"Searching for sequence: {target_upper}")

            # Only the first occurrence of each target is highlighted.
            pos = sequence_upper.find(target_upper)
            if pos != -1:
                logger.debug(f"Found sequence at position: {pos}")
                color = 'red' if strand == '-' else 'green'
                highlights.append((pos, len(sequence_to_find), color))
            else:
                logger.debug(f"Sequence not found: {target_upper}")

        sequences_found = len(highlights)

        # Only show warning if NO sequences were found
        if sequences_found == 0:
            logger.warning("No sequences could be highlighted")
            QMessageBox.warning(self.view, "Highlighting Failed",
                                "Could not highlight any of the selected sequences in the current gene view.")
            return

        logger.debug(f"Found {sequences_found} out of {total_sequences} sequences to highlight")

        # Build highlighted sequence, walking the hits left to right.
        result = []
        last_pos = 0
        for pos, length, color in sorted(highlights):
            end = pos + length
            # Clip a hit that starts inside text already emitted; without
            # this, overlapping or duplicate hits would write the shared
            # bases twice and lengthen the displayed sequence.
            start = max(pos, last_pos)
            if start >= end:
                continue  # fully covered by an earlier highlight
            result.append(sequence[last_pos:start])
            result.append(f"<span style='background-color: {color};'>")
            result.append(sequence[start:end])
            result.append("</span>")
            last_pos = end

        result.append(sequence[last_pos:])
        highlighted_sequence = ''.join(result)

        # Update the view with highlighted sequence
        self.view.update_gene_viewer(highlighted_sequence)
        logger.debug(f"Successfully highlighted {sequences_found} sequences")

    except Exception as e:
        self.global_settings.logger.error(f"Error highlighting targets: {str(e)}")
        self.global_settings.logger.error(f"Stack trace: {traceback.format_exc()}")
459
+
460
def update_scores(self, scores, algorithm):
    """Write alternative-algorithm scores into the targets table.

    Args:
        scores: Sequence of scores, matched in order to the selected table
            rows (ascending row number).  A value of ``-1`` is skipped and
            leaves the cell untouched.
        algorithm: Column header under which the scores are shown.  When no
            such column exists one is inserted directly after "Score".

    Returns early (with a warning in the log) when no rows are selected.

    Raises:
        Exception: Any failure is logged and re-raised to the caller, e.g.
            ``ValueError`` when the table has no "Score" header.
    """
    try:
        table = self.view.table_targets

        # Get current table headers
        headers = self.view.get_table_headers()

        # Get selected rows
        selected_rows = sorted({index.row() for index in table.selectedIndexes()})
        if not selected_rows:
            self.global_settings.logger.warning("No rows selected for scoring")
            return

        # Determine the position for the new column (after the "Score" column)
        desired_index = headers.index("Score") + 1

        # Suspend repaints while the table is restructured.
        table.setUpdatesEnabled(False)

        try:
            if algorithm not in headers:
                # insertColumn() itself shifts the existing columns to the
                # right, so nothing has to be moved by hand afterwards.
                table.insertColumn(desired_index)
                table.setHorizontalHeaderItem(
                    desired_index,
                    QtWidgets.QTableWidgetItem(algorithm)
                )
                col_index = desired_index
            else:
                col_index = headers.index(algorithm)

            # Update scores in the table for selected rows only; zip() stops
            # at the shorter of the two, so surplus rows are left untouched.
            for row, score in zip(selected_rows, scores):
                if score == -1:
                    continue

                score_item = QtWidgets.QTableWidgetItem()
                # Round to 2 decimal places
                rounded_score = round(float(score), 2)
                # EditRole keeps the value numeric instead of storing text.
                score_item.setData(QtCore.Qt.ItemDataRole.EditRole, rounded_score)
                table.setItem(row, col_index, score_item)

                # Also update the target data to preserve score during filtering/sorting.
                # NOTE(review): assumes the table row number is also the index
                # into _all_results, and the key is 'azimuth_score' whatever
                # `algorithm` is -- confirm both against the view.
                if hasattr(self.view, '_all_results'):
                    self.view._all_results[row]['azimuth_score'] = rounded_score

            # Resize columns to fit new content
            table.resizeColumnsToContents()

            self.global_settings.logger.debug(f"Updated scores for algorithm: {algorithm}")
            self.global_settings.logger.debug(f"Updated rows: {selected_rows}")

        finally:
            # Re-enable updates even if something above failed.
            table.setUpdatesEnabled(True)

    except Exception as e:
        self.global_settings.logger.error(f"Error updating scores: {str(e)}")
        raise
src/models/AnnotationParser.py CHANGED
@@ -3,6 +3,9 @@ from Bio import SeqIO
3
  import os
4
  import traceback
5
  from functools import lru_cache
 
 
 
6
 
7
  class AnnotationParser:
8
  def __init__(self, global_settings):
@@ -10,36 +13,129 @@ class AnnotationParser:
10
  self.logger = global_settings.get_logger()
11
  self.annotation_file_name = ""
12
  self.available_genes = []
13
- self._feature_cache = {} # Cache for feature data
14
- self._record_cache = {} # Cache for SeqIO records
15
- self.gene_cache = {} # Add cache for gene data
 
16
 
17
  def set_annotation_file(self, file_path):
18
- if self.annotation_file_name != file_path:
19
- self.annotation_file_name = file_path
20
- self.logger.debug(f"Set annotation file to: {file_path}")
21
- self._feature_cache.clear() # Clear cache when file changes
22
- self._record_cache.clear()
23
- if hasattr(self, '_gene_index'):
24
- delattr(self, '_gene_index')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- # Pre-load records and build index
27
- records = self._get_records()
28
- self._build_gene_index(records)
29
- self._parse_available_genes()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- @lru_cache(maxsize=1)
32
- def _get_records(self):
33
- """Cache and return all records from the annotation file"""
34
- if not self._record_cache:
35
- try:
36
- self._record_cache = list(SeqIO.parse(self.annotation_file_name, "genbank"))
37
- except Exception as e:
38
- self.logger.error(f"Error reading annotation file: {str(e)}")
39
- return []
40
- return self._record_cache
 
 
 
 
 
 
 
 
 
 
41
 
42
  def genbank_search(self, queries):
 
43
  try:
44
  if not self.annotation_file_name:
45
  raise ValueError("Annotation file not set")
@@ -49,39 +145,86 @@ class AnnotationParser:
49
 
50
  # Convert queries to lowercase set for faster lookup
51
  queries = {q.lower() for q in queries}
 
52
 
53
- # Use cached records
54
- for record in self._get_records():
55
- for feature in record.features:
56
- if feature.type in ['CDS', 'gene']:
57
- # Create a hashable cache key using feature start and end positions
58
- cache_key = (record.id, feature.type,
59
- str(feature.location.start),
60
- str(feature.location.end))
61
-
62
- # Use cached feature info if available
63
- if cache_key not in self._feature_cache:
64
- self._feature_cache[cache_key] = self._get_feature_info(feature)
65
-
66
- feature_info = self._feature_cache[cache_key]
67
-
68
- # Combine searchable text for single comparison
69
- searchable_text = ' '.join([
70
- feature_info['feature_name'].lower(),
71
- feature_info['feature_id'].lower(),
72
- feature_info['feature_description'].lower()
73
- ])
 
74
 
75
- # Check if any query matches
76
- if any(query in searchable_text for query in queries):
77
- results_list.append((record.id, feature))
78
-
79
- self.logger.debug(f"Found {len(results_list)} results")
80
  return results_list
 
81
  except Exception as e:
82
  self.logger.error(f"Error in genbank_search: {str(e)}")
83
  raise
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def get_max_chrom(self):
86
  try:
87
  parser = SeqIO.parse(self.annotation_file_name, 'genbank')
@@ -153,59 +296,9 @@ class AnnotationParser:
153
  def get_available_genes(self):
154
  return self.available_genes
155
 
156
- def get_gene_data(self, gene_identifier):
157
- """
158
- Get gene data using gene name or locus tag with optimized caching
159
- """
160
- try:
161
- self.logger.debug(f"AnnotationParser.get_gene_data called with identifier: {gene_identifier}")
162
-
163
- if not gene_identifier:
164
- self.logger.warning("Empty gene identifier provided")
165
- return None
166
-
167
- # Handle numeric gene identifiers
168
- if isinstance(gene_identifier, int) or str(gene_identifier).isdigit():
169
- if self.available_genes:
170
- gene_identifier = self.available_genes[0]
171
- else:
172
- return None
173
-
174
- # Check main cache first
175
- cache_key = f"gene_data_{gene_identifier}"
176
- if cache_key in self._feature_cache:
177
- return self._feature_cache[cache_key]
178
-
179
- # Get cached records
180
- records = self._get_records()
181
- if not records:
182
- return None
183
-
184
- # Use gene index if available
185
- if not hasattr(self, '_gene_index'):
186
- self._build_gene_index(records)
187
-
188
- # Try to get location from index
189
- if gene_identifier in self._gene_index:
190
- record_id, feature = self._gene_index[gene_identifier]
191
- for record in records:
192
- if record.id == record_id:
193
- sequence = str(feature.extract(record.seq))
194
- feature_info = self._get_feature_info(feature)
195
-
196
- result = {
197
- 'sequence': sequence,
198
- 'info': feature_info
199
- }
200
-
201
- self._feature_cache[cache_key] = result
202
- return result
203
-
204
- return None
205
-
206
- except Exception as e:
207
- self.logger.error(f"Error in get_gene_data: {str(e)}")
208
- return None
209
 
210
  def _build_gene_index(self, records):
211
  """Build an index of genes for faster lookup"""
@@ -233,6 +326,3 @@ class AnnotationParser:
233
  except Exception as e:
234
  self.logger.error(f"Error parsing available genes: {str(e)}")
235
 
236
- def get_full_gene_sequence(self):
237
- # Implement this method if needed
238
- pass
 
3
  import os
4
  import traceback
5
  from functools import lru_cache
6
+ import json
7
+ import pickle
8
+ import time
9
 
10
  class AnnotationParser:
11
  def __init__(self, global_settings):
 
13
  self.logger = global_settings.get_logger()
14
  self.annotation_file_name = ""
15
  self.available_genes = []
16
+ self._feature_cache = {}
17
+ self._record_cache = {}
18
+ self.gene_cache = {}
19
+ self.index_file = None
20
 
21
  def set_annotation_file(self, file_path):
22
+ try:
23
+ if self.annotation_file_name != file_path:
24
+ total_start = time.time()
25
+
26
+ self.annotation_file_name = file_path
27
+ self.logger.debug(f"Set annotation file to: {file_path}")
28
+
29
+ # Set index file path
30
+ self.index_file = f"{file_path}.index"
31
+
32
+ # Load or create index
33
+ index_start = time.time()
34
+ if not self._load_index():
35
+ self.logger.debug("Index not found or outdated, creating new index...")
36
+ create_start = time.time()
37
+ self._create_index()
38
+ create_time = time.time() - create_start
39
+ self.logger.debug(f"Index creation time: {create_time:.2f} seconds")
40
+ index_time = time.time() - index_start
41
+ self.logger.debug(f"Total index handling time: {index_time:.2f} seconds")
42
+
43
+ except Exception as e:
44
+ self.logger.error(f"Error in set_annotation_file: {str(e)}")
45
+ raise
46
+
47
def _create_index(self):
    """Build the locus-tag index for the current GenBank file and cache it on disk.

    Every ``CDS``/``gene`` feature whose ``feature_id`` is present and not
    "N/A" is stored under the lower-cased id, and each record's full
    sequence is stored under the record id.  The finished index is
    published as ``self._index`` and then pickled to ``self.index_file``.

    Returns:
        bool: True when the index is available in memory (even if it could
        not be written to disk), False when building it failed.
    """
    try:
        start_time = time.time()
        self.logger.debug("Creating gene index file...")

        index_data = {
            'locus_tags': {},  # lower-cased locus tag -> feature entry
            'sequences': {},   # record id -> full record sequence
        }
        record_count = 0
        feature_count = 0

        for record in SeqIO.parse(self.annotation_file_name, "genbank"):
            record_count += 1
            record_start = time.time()
            index_data['sequences'][record.id] = str(record.seq)

            for feature in record.features:
                if feature.type not in ('CDS', 'gene'):
                    continue
                feature_count += 1
                feature_info = self._get_feature_info(feature)
                locus_tag = feature_info['feature_id']
                if not locus_tag or locus_tag.lower() == "n/a":
                    continue

                # A later feature with the same tag replaces the earlier one.
                index_data['locus_tags'][locus_tag.lower()] = {
                    'record_id': record.id,
                    'feature_type': feature.type,
                    'chromosome': record.id,
                    'location': self._get_feature_location(feature),
                    # Anything other than strand 1 is recorded as '-'.
                    'strand': '+' if feature.location.strand == 1 else '-',
                    'locus_tag': locus_tag,
                    'gene_name': feature_info['feature_name'],
                    'description': feature_info['feature_description'],
                    # Keep only the first value of list-valued qualifiers.
                    'qualifiers': {k: v[0] if isinstance(v, list) else v
                                   for k, v in feature.qualifiers.items()},
                }

            if record_count % 100 == 0:
                record_time = time.time() - record_start
                self.logger.debug(f"Processed {record_count} records, {feature_count} features. Last record time: {record_time:.2f}s")

        # Publish before saving: a read-only or full disk must not throw
        # away an index that was built successfully.
        self._index = index_data

        save_start = time.time()
        try:
            with open(self.index_file, 'wb') as f:
                pickle.dump(index_data, f)
        except OSError as e:
            self.logger.warning(f"Could not save index file {self.index_file}: {str(e)}")
        save_time = time.time() - save_start

        total_time = time.time() - start_time
        self.logger.debug(f"Index creation complete. Records: {record_count}, Features: {feature_count}")
        self.logger.debug(f"Save time: {save_time:.2f}s, Total time: {total_time:.2f}s")
        return True

    except Exception as e:
        self.logger.error(f"Error creating index: {str(e)}")
        return False
115
 
116
def _load_index(self):
    """Load the pickled gene index if it exists and is not older than the GenBank file.

    Returns:
        bool: True when an index with the expected layout was loaded into
        ``self._index``; False when the index file is missing, stale,
        unreadable, or laid out differently.
    """
    try:
        if not os.path.exists(self.index_file):
            return False

        # An index older than the GenBank file may describe outdated features.
        if os.path.getmtime(self.index_file) < os.path.getmtime(self.annotation_file_name):
            return False

        start_time = time.time()
        # NOTE(security): unpickling can execute arbitrary code; only index
        # files written by this application should sit next to the data.
        with open(self.index_file, 'rb') as f:
            loaded = pickle.load(f)

        # Reject anything that is not the {'locus_tags': {}, 'sequences': {}}
        # structure the search methods index into.
        if not (isinstance(loaded, dict)
                and isinstance(loaded.get('locus_tags'), dict)
                and isinstance(loaded.get('sequences'), dict)):
            self.logger.warning("Index file has an unexpected layout, ignoring it")
            return False

        self._index = loaded
        load_time = time.time() - start_time
        self.logger.debug(f"Index file loaded successfully in {load_time:.2f} seconds")
        return True

    except Exception as e:
        self.logger.error(f"Error loading index: {str(e)}")
        return False
136
 
137
  def genbank_search(self, queries):
138
+ """Search using the index file for better performance"""
139
  try:
140
  if not self.annotation_file_name:
141
  raise ValueError("Annotation file not set")
 
145
 
146
  # Convert queries to lowercase set for faster lookup
147
  queries = {q.lower() for q in queries}
148
+ print(f"Search queries: {queries}")
149
 
150
+ # Search through index
151
+ if hasattr(self, '_index'):
152
+ # Search through all features
153
+ for feature_key, feature_entry in self._index['locus_tags'].items():
154
+ # Check gene name, locus tag, and description
155
+ searchable_text = ' '.join([
156
+ feature_entry['gene_name'].lower(),
157
+ feature_entry['locus_tag'].lower(),
158
+ feature_entry['description'].lower(),
159
+ # Also search through qualifiers
160
+ *[str(v).lower() for v in feature_entry['qualifiers'].values()]
161
+ ])
162
+
163
+ # Check if any query matches
164
+ if any(query in searchable_text for query in queries):
165
+ info = {
166
+ 'feature_id': feature_entry['locus_tag'],
167
+ 'feature_name': feature_entry['gene_name'],
168
+ 'feature_location': feature_entry['location'],
169
+ 'feature_description': feature_entry['description']
170
+ }
171
+ results_list.append((feature_entry['record_id'], info))
172
 
 
 
 
 
 
173
  return results_list
174
+
175
  except Exception as e:
176
  self.logger.error(f"Error in genbank_search: {str(e)}")
177
  raise
178
 
179
def get_gene_data(self, gene_identifier):
    """Look up a gene in the loaded index by locus tag.

    The identifier is converted to a string, stripped and lower-cased
    before the lookup.  A direct key hit is tried first, then a scan that
    compares every key case-insensitively.

    Args:
        gene_identifier: Locus tag to look up; falsy values yield None.

    Returns:
        dict | None: ``{'sequence': <sequence stored for the gene's record>,
        'info': <index entry>}``, or None when no index is loaded, the gene
        is unknown, or an error occurred (errors are logged).
    """
    try:
        if not gene_identifier:
            return None

        wanted = str(gene_identifier).strip().lower()
        if not hasattr(self, '_index'):
            return None

        locus_tags = self._index['locus_tags']
        if wanted in locus_tags:
            entry = locus_tags[wanted]
        else:
            # Fall back to comparing every key in lower case.
            candidates = (value for key, value in locus_tags.items()
                          if str(key).lower() == wanted)
            entry = next(candidates, None)
            if entry is None:
                return None

        return {
            'sequence': self._index['sequences'][entry['record_id']],
            'info': entry,
        }

    except Exception as e:
        self.logger.error(f"Error in get_gene_data: {str(e)}")
        return None
212
+
213
def _get_records(self):
    """Return all records of the current annotation file, parsing it at most once per file.

    The parsed list is kept in ``self._record_cache`` together with the
    file name it came from, so switching annotation files triggers a fresh
    parse and a failed read is retried on the next call instead of being
    cached.  (An ``lru_cache`` decorator on this method would key on
    ``self``, keep the instance alive and never notice a file change.)

    Returns:
        list: The parsed records, or an empty list when the file cannot be
        read (the error is logged).
    """
    # A cache without a recorded source is trusted for the current file.
    cached_for = getattr(self, '_record_cache_source', self.annotation_file_name)
    if self._record_cache and cached_for == self.annotation_file_name:
        return self._record_cache

    start_time = time.time()
    try:
        self.logger.debug("Loading records from file...")
        records = list(SeqIO.parse(self.annotation_file_name, "genbank"))
    except Exception as e:
        self.logger.error(f"Error reading annotation file: {str(e)}")
        return []

    self._record_cache = records
    self._record_cache_source = self.annotation_file_name
    load_time = time.time() - start_time
    self.logger.debug(f"Time to load records: {load_time:.2f} seconds")
    return records
227
+
228
  def get_max_chrom(self):
229
  try:
230
  parser = SeqIO.parse(self.annotation_file_name, 'genbank')
 
296
  def get_available_genes(self):
297
  return self.available_genes
298
 
299
def get_full_gene_sequence(self):
    """Placeholder that is not implemented yet; always returns None."""
    # Implement this method if needed
    return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  def _build_gene_index(self, records):
304
  """Build an index of genes for faster lookup"""
 
326
  except Exception as e:
327
  self.logger.error(f"Error parsing available genes: {str(e)}")
328
 
 
 
 
src/models/CSPRparser.py CHANGED
@@ -2,101 +2,193 @@ from utils.sequence_utils import SeqTranslate
2
  import logging
3
  from multiprocessing import Pool, cpu_count
4
  from functools import partial
 
 
 
 
5
 
6
  class CSPRparser:
7
  def __init__(self, inputFileName, casper_info_path):
8
  self.fileName = inputFileName
9
- self.filename = inputFileName
10
  self.seqTrans = SeqTranslate(casper_info_path)
11
  self.logger = logging.getLogger(__name__)
12
- self._line_buffer = [] # Pre-allocate buffer for lines
13
  self._cached_results = {}
 
14
 
15
- def read_targets_batch(self, chromosome, targets, endonuclease):
16
- """Ultra-fast target reading using direct tuple creation"""
17
  try:
18
- # Pre-process targets into a sorted list of ranges for faster lookup
19
- target_ranges = []
20
- for t in targets:
21
- start = t['start']
22
- end = t['end']
23
- target_ranges.append((start, end, t['feature_name']))
24
- target_ranges.sort() # Sort by start position
25
-
26
- # Pre-allocate results list
27
- results = []
28
- results_append = results.append
29
 
30
- # Read file in binary mode for speed
31
  with open(self.fileName, 'rb') as f:
32
- # Skip header
33
  for _ in range(3):
34
  f.readline()
35
 
36
- # Find chromosome section
37
- header = False
 
 
38
  for line in f:
39
- if b'>' in line and str(chromosome).encode() in line:
40
- header = True
41
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # Read targets
44
- if header:
45
- current_range_idx = 0
46
- max_ranges = len(target_ranges)
 
 
47
 
48
- while current_range_idx < max_ranges:
49
- line = f.readline()
50
- if not line or line.startswith(b'>'):
51
- break
52
-
53
- if not line.strip():
54
- continue
55
-
56
- # Fast string splitting without decode
57
- parts = line.strip().split(b',')
58
- if not parts:
59
- continue
60
-
 
 
 
 
 
 
61
  try:
62
- pos = int(parts[0])
63
- abs_pos = abs(pos)
64
 
65
- # Get current target range
66
- start, end, feature_name = target_ranges[current_range_idx]
67
-
68
- # Skip if position is past current range
69
- if abs_pos >= end:
70
- current_range_idx += 1
71
- continue
72
-
73
- # Check if position is in range
74
- if start <= abs_pos < end:
75
- sequence = parts[1].decode()
76
- pam = sequence[-3:]
77
- target_seq = sequence[:-3]
78
-
79
- results_append({
80
  'feature_name': feature_name,
81
- 'chromosome': chromosome,
82
- 'position': abs_pos,
83
- 'location': f"{abs_pos}-{abs_pos + 23}",
84
- 'sequence': target_seq,
85
- 'pam': pam,
 
86
  'strand': "-" if pos < 0 else "+",
87
- 'score': float(parts[3]) if len(parts) > 3 else 0.0,
88
  'endonuclease': endonuclease
89
  })
90
-
91
- except (ValueError, IndexError):
92
- continue
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  return results
95
-
96
  except Exception as e:
97
  self.logger.error(f"Error in read_targets_batch: {str(e)}")
 
98
  return []
99
-
100
  def parse_targets(self, file_path, region):
101
  """Parse targets with parallel processing and caching"""
102
  cache_key = f"{file_path}:{region}"
@@ -127,3 +219,4 @@ class CSPRparser:
127
  chunk_end = chunk_start + chunk_size if i < cpu_count()-1 else end
128
  chunks.append((chunk_start, chunk_end))
129
  return chunks
 
 
2
  import logging
3
  from multiprocessing import Pool, cpu_count
4
  from functools import partial
5
+ import time
6
+ import pickle
7
+ import os
8
+ import traceback
9
 
10
  class CSPRparser:
11
  def __init__(self, inputFileName, casper_info_path):
12
  self.fileName = inputFileName
 
13
  self.seqTrans = SeqTranslate(casper_info_path)
14
  self.logger = logging.getLogger(__name__)
 
15
  self._cached_results = {}
16
+ self.index_file = f"{inputFileName}.index"
17
 
18
def _create_index(self):
    """Index the CSPR file by chromosome section and cache the index on disk.

    After three header lines are skipped, every line starting with ``>``
    opens a new chromosome section.  Each following line that begins with
    an integer position and a comma is stored as ``(abs(position), raw_line)``.
    Sections are kept in file order, including sections without targets,
    because ``read_targets_batch`` selects a chromosome by its 1-based
    ordinal among the index keys.  Entries are sorted by position.

    Returns:
        bool: True when the index is available as ``self._index`` (even if
        it could not be written to disk), False when building it failed.
    """
    try:
        start_time = time.time()
        self.logger.debug("Creating CSPR index file...")

        index_data = {}
        chrom_data = None  # entry list of the section currently being read

        with open(self.fileName, 'rb') as f:
            # Skip header lines
            for _ in range(3):
                f.readline()

            for line in f:
                if line.startswith(b'>'):
                    # Key is the first header token without the '>'.
                    name = line.decode().split()[0][1:]
                    key = name
                    if not key or key in index_data:
                        # Keep empty or repeated names from overwriting a
                        # section, which would shift the ordinals.
                        key = f"{name or 'chromosome'}_{len(index_data) + 1}"
                    chrom_data = index_data[key] = []
                    continue

                if chrom_data is None or not line.strip():
                    continue

                first_comma = line.find(b',')
                if first_comma == -1:
                    continue
                try:
                    pos = int(line[:first_comma])
                except ValueError:
                    continue
                chrom_data.append((abs(pos), line))

        # Lookups walk each section in ascending position order.
        for entries in index_data.values():
            entries.sort(key=lambda entry: entry[0])

        # Publish before saving so a failed write does not lose the index.
        self._index = index_data
        try:
            with open(self.index_file, 'wb') as f:
                pickle.dump(index_data, f)
        except OSError as e:
            self.logger.warning(f"Could not save index file {self.index_file}: {str(e)}")

        create_time = time.time() - start_time
        self.logger.debug(f"Index creation time: {create_time:.2f} seconds")
        return True

    except Exception as e:
        self.logger.error(f"Error creating index: {str(e)}")
        return False
77
+
78
def _load_index(self):
    """Load the pickled CSPR index if it exists and is not older than the CSPR file.

    Returns:
        bool: True when a dict index was loaded into ``self._index``; False
        when the index file is missing, stale, unreadable, or does not
        contain a dict.
    """
    try:
        if not os.path.exists(self.index_file):
            return False

        # An index older than the CSPR file no longer describes it.
        if os.path.getmtime(self.index_file) < os.path.getmtime(self.fileName):
            return False

        # NOTE(security): unpickling can execute arbitrary code; only index
        # files written by this application should sit next to the data.
        with open(self.index_file, 'rb') as f:
            loaded = pickle.load(f)

        if not isinstance(loaded, dict):
            self.logger.warning("Index file has an unexpected layout, ignoring it")
            return False

        self._index = loaded
        return True

    except Exception as e:
        self.logger.error(f"Error loading index: {str(e)}")
        return False
93
+
94
def read_targets_batch(self, chromosome, targets, endonuclease):
    """Collect every indexed CSPR target that falls inside the given ranges.

    Args:
        chromosome: 1-based ordinal of the chromosome section in the index
            (anything ``int()`` accepts).
        targets: Iterable of dicts with ``'start'`` and ``'end'`` and
            optional ``'feature_id'`` / ``'feature_name'``.  A range covers
            positions ``start <= pos < end``.
        endonuclease: Value copied into each result's ``'endonuclease'``.

    Returns:
        list[dict]: One dict per matching CSPR line (feature name/id,
        chromosome key, position, location, sequence, PAM, strand, score,
        endonuclease).  Empty when nothing matches, ``targets`` is empty,
        or an error occurred (errors are logged).
    """
    # Local import keeps this method self-contained.
    from bisect import bisect_left

    try:
        start_time = time.time()

        # Sort targets by start position
        sorted_targets = sorted(targets, key=lambda x: x['start'])
        if not sorted_targets:
            # Nothing requested is a normal outcome, not an error.
            return []

        # Load or create index
        if not hasattr(self, '_index'):
            if not self._load_index():
                self._create_index()

        min_start = sorted_targets[0]['start']
        max_end = max(t['end'] for t in sorted_targets)
        self.logger.debug(f"Processing targets from {min_start} to {max_end}")
        self.logger.debug(f"Looking for chromosome number: {chromosome}")

        results = []
        lines_processed = 0
        lines_skipped = 0

        # The chromosome is addressed by its position among the index keys.
        target_chrom_num = int(chromosome)
        chrom_ids = list(self._index.keys())
        self.logger.debug(f"Available chromosomes: {chrom_ids}")

        if 1 <= target_chrom_num <= len(chrom_ids):
            found_chrom = chrom_ids[target_chrom_num - 1]
            chrom_str = found_chrom.decode() if isinstance(found_chrom, bytes) else found_chrom
            self.logger.debug(f"Found matching chromosome: {chrom_str}")

            chrom_data = self._index[found_chrom]
            total = len(chrom_data)

            for target in sorted_targets:
                target_end = target['end']
                feature_id = target.get('feature_id', '')
                feature_name = target.get('feature_name', '')

                # Entries are (abs_pos, raw_line) in ascending order; the
                # 1-tuple sorts before any entry with the same position,
                # so this finds the first entry with pos >= start.
                current_idx = bisect_left(chrom_data, (target['start'],))

                while current_idx < total and chrom_data[current_idx][0] < target_end:
                    try:
                        parts = chrom_data[current_idx][1].split(b',')
                        if len(parts) >= 4:
                            # Re-read the signed position: the sign carries the strand.
                            pos = int(parts[0])
                            results.append({
                                'feature_name': feature_name,
                                'feature_id': feature_id,
                                'chromosome': found_chrom,
                                'position': abs(pos),
                                # Location spans a fixed 23 positions from the target position.
                                'location': f"{abs(pos)}-{abs(pos) + 23}",
                                'sequence': parts[1].decode(),
                                'pam': parts[2].decode(),
                                'strand': "-" if pos < 0 else "+",
                                'score': float(parts[3]),
                                'endonuclease': endonuclease
                            })
                            lines_processed += 1
                    except (ValueError, IndexError) as e:
                        self.logger.error(f"Error processing line: {str(e)}")
                        lines_skipped += 1

                    current_idx += 1
        else:
            self.logger.error(f"Chromosome {chromosome} not found in index")
            self.logger.debug(f"Available chromosomes: {chrom_ids}")

        total_time = time.time() - start_time
        self.logger.debug(f"Processed {lines_processed} lines, skipped {lines_skipped}")
        self.logger.debug(f"Found {len(results)} targets in {total_time:.2f} seconds")

        return results

    except Exception as e:
        self.logger.error(f"Error in read_targets_batch: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
        return []
 
192
  def parse_targets(self, file_path, region):
193
  """Parse targets with parallel processing and caching"""
194
  cache_key = f"{file_path}:{region}"
 
219
  chunk_end = chunk_start + chunk_size if i < cpu_count()-1 else end
220
  chunks.append((chunk_start, chunk_end))
221
  return chunks
222
+
src/models/FindTargetsModel.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from models.HomeWindowModel import HomeWindowModel
2
  from models.CSPRparser import CSPRparser
3
  from models.AnnotationParser import AnnotationParser
@@ -26,16 +27,24 @@ class FindTargetsModel(HomeWindowModel):
26
  def find_targets(self, input_data):
27
  self.global_settings.logger.debug(f"Received input data: {input_data}")
28
 
 
 
29
  organism = input_data['organism']
30
  endo = input_data['endonuclease']
31
  org_files = self.get_organism_to_files()
32
 
33
  # Validate input data
 
34
  self._validate_input(organism, endo, org_files)
 
 
35
 
36
  # Get file path and parser
 
37
  file_path = os.path.join(self.global_settings.get_db_path(), org_files[organism][endo][0])
38
  parser = self._get_parser(file_path)
 
 
39
 
40
  # Use dictionary for faster lookup
41
  search_types = {
@@ -50,7 +59,15 @@ class FindTargetsModel(HomeWindowModel):
50
  self.global_settings.logger.error(error_msg)
51
  raise ValueError(error_msg)
52
 
 
 
53
  self.results = search_func(parser, input_data)
 
 
 
 
 
 
54
  return self.results
55
 
56
  def _validate_input(self, organism, endo, org_files):
@@ -66,85 +83,78 @@ class FindTargetsModel(HomeWindowModel):
66
  raise ValueError(error_msg)
67
 
68
  def find_targets_by_feature(self, parser, input_data):
69
- # Get annotation file from input data or global settings
70
- annotation_file = (input_data.get('annotation_file') or
71
- self.global_settings.get_current_annotation_file())
72
-
73
- search_query = input_data['search_query'].strip().lower()
74
-
75
- # Create new annotation parser instance for each search
76
- annotation_parser = AnnotationParser(self.global_settings)
77
- annotation_file_path = os.path.join(self.global_settings.get_db_path(), 'GBFF', annotation_file)
78
- annotation_parser.set_annotation_file(annotation_file_path)
79
-
80
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  results_list = annotation_parser.genbank_search([search_query])
82
- self.global_settings.logger.debug(f"Genbank search results: {results_list}")
83
- except Exception as e:
84
- self.global_settings.logger.error(f"Error in genbank_search: {str(e)}")
85
- return []
86
-
87
- # Use a set for faster lookups
88
- search_terms = {search_query}
89
- formatted_results = []
90
-
91
- # Pre-calculate feature info once for each feature
92
- for chrom, feature in results_list:
93
- if feature.type in ['CDS']:
94
- feature_info = self._get_feature_info(feature)
95
 
96
- # Combine all searchable text into one string for a single search operation
97
- searchable_text = ' '.join([
98
- feature_info['feature_name'].lower(),
99
- feature_info['feature_id'].lower(),
100
- feature_info['feature_description'].lower()
101
- ])
102
 
103
- # Single check if any search term is in the searchable text
104
- if any(term in searchable_text for term in search_terms):
105
- formatted_results.append({
106
- 'feature_type': feature.type,
107
- 'chromosome': chrom,
108
- 'feature_id': feature_info['feature_id'],
109
- 'feature_name': feature_info['feature_name'],
110
- 'feature_description': feature_info['feature_description'],
111
- 'location': f"{feature.location.start}-{feature.location.end}",
112
- 'strand': '+' if feature.strand == 1 else '-'
113
- })
114
-
115
- self.global_settings.logger.debug(f"Total features found: {len(formatted_results)}")
116
- return formatted_results
117
-
118
- def _get_feature_info(self, feature):
119
- return {
120
- 'feature_id': self._get_feature_id(feature),
121
- 'feature_name': self._get_feature_name(feature),
122
- 'feature_description': self._get_feature_description(feature)
123
- }
124
-
125
- def _find_associated_cds(self, gene_feature):
126
- for feature in gene_feature.parent.features:
127
- if feature.type == 'CDS' and feature.location == gene_feature.location:
128
- return feature
129
- return None
130
-
131
- def _get_feature_id(self, feature):
132
- for key in ['locus_tag', 'protein_id', 'id']:
133
- if key in feature.qualifiers:
134
- return feature.qualifiers[key][0]
135
- return "N/A"
136
-
137
- def _get_feature_name(self, feature):
138
- for key in ['gene', 'product']:
139
- if key in feature.qualifiers:
140
- return feature.qualifiers[key][0]
141
- return "N/A"
142
-
143
- def _get_feature_description(self, feature):
144
- for key in ['product', 'note']:
145
- if key in feature.qualifiers:
146
- return feature.qualifiers[key][0]
147
- return "N/A"
148
 
149
  def find_targets_by_position(self, parser, input_data):
150
  search_query = input_data['search_query']
 
1
+ import time
2
  from models.HomeWindowModel import HomeWindowModel
3
  from models.CSPRparser import CSPRparser
4
  from models.AnnotationParser import AnnotationParser
 
27
  def find_targets(self, input_data):
28
  self.global_settings.logger.debug(f"Received input data: {input_data}")
29
 
30
+ start_time = time.time()
31
+
32
  organism = input_data['organism']
33
  endo = input_data['endonuclease']
34
  org_files = self.get_organism_to_files()
35
 
36
  # Validate input data
37
+ validate_start = time.time()
38
  self._validate_input(organism, endo, org_files)
39
+ validate_time = time.time() - validate_start
40
+ self.global_settings.logger.debug(f"Validation time: {validate_time:.2f} seconds")
41
 
42
  # Get file path and parser
43
+ parser_start = time.time()
44
  file_path = os.path.join(self.global_settings.get_db_path(), org_files[organism][endo][0])
45
  parser = self._get_parser(file_path)
46
+ parser_time = time.time() - parser_start
47
+ self.global_settings.logger.debug(f"Parser initialization time: {parser_time:.2f} seconds")
48
 
49
  # Use dictionary for faster lookup
50
  search_types = {
 
59
  self.global_settings.logger.error(error_msg)
60
  raise ValueError(error_msg)
61
 
62
+ # Perform the search
63
+ search_start = time.time()
64
  self.results = search_func(parser, input_data)
65
+ search_time = time.time() - search_start
66
+ self.global_settings.logger.debug(f"Search execution time: {search_time:.2f} seconds")
67
+
68
+ total_time = time.time() - start_time
69
+ self.global_settings.logger.debug(f"Total find_targets time: {total_time:.2f} seconds")
70
+
71
  return self.results
72
 
73
  def _validate_input(self, organism, endo, org_files):
 
83
  raise ValueError(error_msg)
84
 
85
  def find_targets_by_feature(self, parser, input_data):
86
+ """Search for features using the indexed annotation parser"""
 
 
 
 
 
 
 
 
 
 
87
  try:
88
+ start_time = time.time()
89
+
90
+ # Get annotation file from input data or global settings
91
+ annotation_file = (input_data.get('annotation_file') or
92
+ self.global_settings.get_current_annotation_file())
93
+
94
+ search_query = input_data['search_query'].strip()
95
+
96
+ # Create new annotation parser instance
97
+ parser_start = time.time()
98
+ annotation_parser = AnnotationParser(self.global_settings)
99
+ annotation_file_path = os.path.join(self.global_settings.get_db_path(), 'GBFF', annotation_file)
100
+ annotation_parser.set_annotation_file(annotation_file_path)
101
+ parser_time = time.time() - parser_start
102
+ self.global_settings.logger.debug(f"Annotation parser initialization time: {parser_time:.2f} seconds")
103
+
104
+ # Use indexed search
105
+ search_start = time.time()
106
  results_list = annotation_parser.genbank_search([search_query])
107
+ search_time = time.time() - search_start
108
+ self.global_settings.logger.debug(f"Genbank search time: {search_time:.2f} seconds")
109
+
110
+ # Format results
111
+ format_start = time.time()
112
+ formatted_results = []
113
+ for record_id, feature_info in results_list:
114
+ # Extract start and end from feature_location
115
+ location = feature_info['feature_location']
116
+ start_end = location.split('(')[0] # Get part before the strand
117
+ start, end = map(int, start_end.split(':'))
 
 
118
 
119
+ # Extract chromosome number from record_id (e.g., "NZ_CP132594.1" -> "1")
120
+ chrom_num = record_id.split('.')[-1] if '.' in record_id else '1'
 
 
 
 
121
 
122
+ # Create target info with feature_id and chromosome number
123
+ target_info = {
124
+ 'feature_type': 'CDS',
125
+ 'chromosome': chrom_num, # Use chromosome number
126
+ 'full_chromosome': record_id, # Store full chromosome name for reference
127
+ 'feature_id': feature_info['feature_id'],
128
+ 'feature_name': feature_info['feature_name'],
129
+ 'feature_description': feature_info['feature_description'],
130
+ 'location': f"{start}-{end}",
131
+ 'start': start,
132
+ 'end': end,
133
+ 'strand': '+' if '(+)' in location else '-',
134
+ 'endonuclease': input_data['endonuclease']
135
+ }
136
+
137
+ # Debug log the target info
138
+ self.global_settings.logger.debug(f"Created target info: {target_info}")
139
+
140
+ formatted_results.append(target_info)
141
+
142
+ format_time = time.time() - format_start
143
+ self.global_settings.logger.debug(f"Result formatting time: {format_time:.2f} seconds")
144
+
145
+ # Debug log sample results
146
+ if formatted_results:
147
+ self.global_settings.logger.debug(f"Sample formatted result: {formatted_results[0]}")
148
+ self.global_settings.logger.debug(f"Feature IDs present: {[r['feature_id'] for r in formatted_results[:5]]}")
149
+
150
+ total_time = time.time() - start_time
151
+ self.global_settings.logger.debug(f"Total find_targets_by_feature time: {total_time:.2f} seconds")
152
+
153
+ return formatted_results
154
+
155
+ except Exception as e:
156
+ self.global_settings.logger.error(f"Error in find_targets_by_feature: {str(e)}")
157
+ raise
 
 
 
 
 
 
 
 
 
158
 
159
  def find_targets_by_position(self, parser, input_data):
160
  search_query = input_data['search_query']
src/models/GlobalSettings.py CHANGED
@@ -285,5 +285,10 @@ class GlobalSettings(QObject):
285
  self._current_annotation_file = self._current_home_window.get_annotation_file()
286
  return self._current_annotation_file
287
 
 
 
 
 
 
288
  # Global instance
289
  global_settings = None
 
285
  self._current_annotation_file = self._current_home_window.get_annotation_file()
286
  return self._current_annotation_file
287
 
288
def get_scoring_options_window(self, view_targets_controller):
    """Build a new ``ScoringOptionsController`` bound to these settings.

    Args:
        view_targets_controller: Controller passed through as the second
            constructor argument.

    Returns:
        ScoringOptionsController: A freshly created controller instance.
    """
    # Imported at call time rather than at module load
    # (presumably to avoid a circular import - TODO confirm).
    from controllers.ScoringOptionsController import ScoringOptionsController
    controller = ScoringOptionsController(self, view_targets_controller)
    return controller
292
+
293
  # Global instance
294
  global_settings = None
src/models/ScoringOptionsModel.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from Bio import SeqIO
3
+ import traceback
4
+ import warnings
5
+ import contextlib
6
+ import sys
7
+ import os
8
+
9
class ScoringOptionsModel:
    """Loads one chromosome from a FASTA file and scores guide sequences against it."""

    # Scoring context cut from the genome: a 30-nt window that starts
    # 4 nt upstream of the first base of the guide.
    _UPSTREAM_FLANK = 4
    _CONTEXT_LENGTH = 30

    def __init__(self, global_settings):
        """Store the settings object and start with no genome loaded."""
        self.global_settings = global_settings
        self.logger = global_settings.get_logger()
        self.genome = ""      # upper-cased forward sequence
        self.rev_genome = ""  # upper-cased reverse complement
        self.fasta_path = ""

    def load_fasta(self, fasta_path, chromosome):
        """Load the sequence of one chromosome from a FASTA file.

        A record whose id equals ``chromosome`` wins.  Otherwise the record
        is chosen by 1-based position: the number itself when ``chromosome``
        is all digits, the part after the last dot for dotted ids (the
        original fallback), and 1 in every other case.

        Args:
            fasta_path: Path of the FASTA file.
            chromosome: Record id or 1-based chromosome number.

        Returns:
            bool: True when a record was loaded; False when none matched or
            an error occurred.  The previous genome is cleared either way.
        """
        try:
            self.fasta_path = fasta_path
            # Clear first so a failed load cannot leave guides being scored
            # against a previously loaded chromosome.
            self.genome = ""
            self.rev_genome = ""

            wanted = str(chromosome).strip()
            by_number = wanted.isdigit()
            if by_number:
                chrom_num = int(wanted)
            elif '.' in wanted:
                chrom_num = int(wanted.split('.')[-1])
            else:
                chrom_num = 1
            self.logger.debug(f"Looking for chromosome number: {chrom_num}")

            selected = None
            for i, record in enumerate(SeqIO.parse(fasta_path, "fasta"), start=1):
                if record.id == wanted:
                    selected = record
                    break
                if i == chrom_num and selected is None:
                    selected = record
                    if by_number:
                        # A plain number cannot also match by id.
                        break

            if selected is None:
                self.logger.error(f"Chromosome {chromosome} not found in FASTA file: {fasta_path}")
                return False

            self.genome = str(selected.seq).upper()
            self.rev_genome = str(selected.seq.reverse_complement()).upper()
            self.logger.debug(f"Loaded chromosome {chrom_num} sequence of length {len(self.genome)}")
            return True

        except Exception as e:
            self.logger.error(f"Error loading FASTA file: {str(e)}")
            self.logger.error(f"Stack trace: {traceback.format_exc()}")
            return False

    @classmethod
    def _extract_context(cls, reference, guide):
        """Return the scoring window around the first hit of ``guide``, or None.

        None is returned when the guide does not occur in ``reference`` or
        when the window would extend past either end of it.
        """
        # The loaded genome is upper-cased, so search in upper case too.
        pos = reference.find(guide.upper())
        if pos == -1:
            return None
        start = pos - cls._UPSTREAM_FLANK
        end = start + cls._CONTEXT_LENGTH
        # A negative start would make the slice wrap around instead of fail.
        if start < 0 or end > len(reference):
            return None
        return reference[start:end]

    def score_sequences(self, targets, algorithm="Azimuth 2.0"):
        """Score the target sequences using the specified algorithm.

        Args:
            targets: Sequence of dicts with ``'strand'``, ``'sequence'``
                and ``'pam'``.
            algorithm: Scoring algorithm name; only "Azimuth 2.0" is known.

        Returns:
            tuple: ``(scores, reject_list, guide_list)``.  ``scores`` has one
            value per target with -1 at rejected indices, or is None when
            no target could be scored.  ``reject_list`` holds the indices
            of targets without a usable genome context, ``guide_list`` the
            ``sequence + pam`` strings.  On error ``(None, [], [])`` is
            returned and the error is logged.
        """
        try:
            guide_list = []
            reject_list = []
            full_seqs = []

            for i, target in enumerate(targets):
                sequence = target['sequence'] + target['pam']
                guide_list.append(sequence)

                # '+' guides are searched on the forward strand, all others
                # on the reverse complement.
                reference = self.genome if target['strand'] == "+" else self.rev_genome
                context = self._extract_context(reference, sequence)
                if context is None:
                    reject_list.append(i)
                else:
                    full_seqs.append(context)

            if not full_seqs:
                return None, reject_list, guide_list

            full_seqs = np.array(full_seqs)

            if algorithm == "Azimuth 2.0":
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    # Make the bundled azimuth package importable.
                    utils_path = os.path.join(self.global_settings.get_src_dir_path(), 'utils')
                    if utils_path not in sys.path:
                        sys.path.append(utils_path)

                    from azimuth import model_comparison as az
                    scores = az.predict(full_seqs) * 100
            else:
                raise ValueError(f"Unknown algorithm: {algorithm}")

            # reject_list is ascending, so inserting at the original index
            # puts each -1 at that target's position.
            for i in reject_list:
                scores = np.insert(scores, i, -1)

            return scores, reject_list, guide_list

        except Exception as e:
            self.logger.error(f"Error scoring sequences: {str(e)}")
            self.logger.error(f"Stack trace: {traceback.format_exc()}")
            return None, [], []
src/models/ViewTargetsModel.py CHANGED
@@ -40,6 +40,17 @@ class ViewTargetsModel(HomeWindowModel):
40
  self._chromosome_seqs = {}
41
  self._cached_targets = {} # Add cache for targets
42
 
 
 
 
 
 
 
 
 
 
 
 
43
  def cleanup(self):
44
  """Cleanup method to be called when the view is closed"""
45
  try:
@@ -59,7 +70,7 @@ class ViewTargetsModel(HomeWindowModel):
59
  def _on_annotation_file_changed(self, new_annotation_file):
60
  """Clear all caches when annotation file changes"""
61
  try:
62
- self.global_settings.logger.debug(f"ViewTargetsModel clearing caches for new annotation file: {new_annotation_file}")
63
  self._gene_data_cache.clear()
64
  self._sequence_cache.clear()
65
  self._parser_cache.clear()
@@ -77,98 +88,91 @@ class ViewTargetsModel(HomeWindowModel):
77
  self._chromosome_seqs = {}
78
 
79
  except Exception as e:
80
- self.global_settings.logger.error(f"Error in _on_annotation_file_changed: {str(e)}")
81
 
82
  def load_targets(self, selected_targets, organism, endonuclease):
83
  """Fast target loading with minimal file operations"""
84
- start_time = time.time()
85
 
86
  try:
87
- self.global_settings.logger.debug(f"Starting load_targets with {len(selected_targets)} targets")
88
 
89
  # Store organism and endonuclease for potential reloading
90
  self.organism = organism
91
  self.endonuclease = endonuclease
92
-
93
  # Get CSPR parser from cache or create new one
94
  parser_start = time.time()
95
  cspr_key = f"{organism}_{endonuclease}"
96
  if cspr_key in self._parser_cache:
97
  self.cspr_parser = self._parser_cache[cspr_key]
 
98
  else:
99
  org_files = self.get_organism_to_files()
100
  if organism not in org_files or endonuclease not in org_files[organism]:
101
- self.global_settings.logger.error(f"No CSPR file found for {organism} and {endonuclease}")
102
  return
103
 
104
  cspr_file = org_files[organism][endonuclease][0]
105
  cspr_path = os.path.join(self.global_settings.get_db_path(), cspr_file)
106
  self.cspr_parser = CSPRparser(cspr_path, self.global_settings.get_casper_info_path())
107
  self._parser_cache[cspr_key] = self.cspr_parser
 
108
  parser_time = time.time() - parser_start
 
109
 
110
  # Initialize targets and genes
 
111
  self.targets = []
112
  self.available_genes = set()
 
 
113
 
114
- # Set up annotation parser if needed
115
- if self.annotation_parser is None:
116
- annotation_start = time.time()
117
- self.annotation_parser = AnnotationParser(self.global_settings)
118
- annotation_files = self.get_annotation_files()
119
- if annotation_files:
120
- self.annotation_path = os.path.join(self.global_settings.get_db_path(), 'GBFF', annotation_files[0])
121
- self.annotation_parser.set_annotation_file(self.annotation_path)
122
- annotation_time = time.time() - annotation_start
123
- else:
124
- annotation_time = 0
125
-
126
- # Process targets in batches by chromosome
127
- processing_start = time.time()
128
-
129
- # Group targets by chromosome and prepare batch reading
130
  batch_targets = defaultdict(list)
131
  for target in selected_targets:
132
  chrom = target['chromosome']
133
  start, end = map(int, target['location'].split('-'))
134
  batch_targets[chrom].append({
135
  'feature_name': target['feature_name'],
 
136
  'start': start,
137
  'end': end
138
  })
139
- self.available_genes.add(target['feature_name'])
 
 
 
140
 
141
- # Batch process targets for each chromosome
 
142
  target_count = 0
143
  for chrom, targets in batch_targets.items():
144
- self.chromosome = chrom
145
-
146
- # Sort targets by start position for more efficient reading
147
- targets.sort(key=lambda x: x['start'])
148
-
149
- # Read targets in a single batch per chromosome
150
- batch_results = self.cspr_parser.read_targets_batch(
151
- chromosome=chrom,
152
- targets=targets,
153
- endonuclease=endonuclease
154
- )
155
-
156
- if batch_results:
157
- self.targets.extend(batch_results)
158
- target_count += len(batch_results)
159
-
160
- processing_time = time.time() - processing_start
161
 
162
- # Convert genes to sorted list
163
- self.available_genes = sorted(list(self.available_genes))
 
164
 
165
- total_time = time.time() - start_time
166
- self.global_settings.logger.debug(f"Total load_targets execution time: {total_time:.2f} seconds")
167
- self.global_settings.logger.debug(f"Found {target_count} total CSPR targets")
168
-
169
  except Exception as e:
170
- self.global_settings.logger.error(f"Error in load_targets: {str(e)}\n{traceback.format_exc()}")
171
- raise
172
 
173
  def _get_chromosome_sequence(self, chromosome):
174
  """Get chromosome sequence on demand"""
@@ -190,118 +194,60 @@ class ViewTargetsModel(HomeWindowModel):
190
  if self.annotation_path:
191
  self.annotation_parser.set_annotation_file(self.annotation_path)
192
 
193
- def get_gene_data(self, gene_name):
194
- """Get gene data with caching"""
195
  try:
196
- if not gene_name:
197
- self.global_settings.logger.error("No gene name provided")
198
  return None
199
 
200
  # Check model cache first
201
- if gene_name in self._gene_data_cache:
202
- return self._gene_data_cache[gene_name]
 
 
 
 
 
 
 
 
203
 
204
- # Make sure annotation parser is initialized
205
- if self.annotation_parser is None:
206
- self._initialize_annotation_parser()
 
 
 
 
207
 
208
- # Get gene data from parser
209
- gene_data = self.annotation_parser.get_gene_data(gene_name)
210
  if gene_data:
211
- self._gene_data_cache[gene_name] = gene_data
 
 
 
212
 
213
  return gene_data
214
 
215
  except Exception as e:
216
- self.global_settings.logger.error(f"Error getting gene data: {str(e)}")
 
217
  return None
218
 
219
  def get_targets(self):
 
220
  return self.targets
221
 
222
- def highlight_targets_in_gene_viewer(self, selected_targets):
223
- """Highlight selected targets in gene viewer"""
224
- try:
225
- self.global_settings.logger.debug("Starting highlight_targets_in_gene_viewer")
226
- sequence = self.extended_sequence
227
- if not sequence:
228
- self.global_settings.logger.error("No extended sequence available")
229
- return sequence
230
-
231
- self.global_settings.logger.debug(f"Extended sequence length: {len(sequence)}")
232
-
233
- # Sort targets by position for efficient highlighting
234
- highlights = []
235
- for target in selected_targets:
236
- self.global_settings.logger.debug(f"Processing target: {target}")
237
- sequence_to_find = target['sequence']
238
- strand = target['strand']
239
-
240
- # For negative strand, we need to use reverse complement
241
- if strand == '-':
242
- sequence_to_find = str(Seq(sequence_to_find).reverse_complement())
243
- self.global_settings.logger.debug(f"Reverse complemented sequence: {sequence_to_find}")
244
-
245
- # Search for the sequence in the gene viewer text
246
- sequence_upper = sequence.upper()
247
- target_upper = sequence_to_find.upper()
248
-
249
- self.global_settings.logger.debug(f"Searching for sequence: {target_upper}")
250
-
251
- # Find all occurrences
252
- pos = sequence_upper.find(target_upper)
253
- if pos != -1:
254
- self.global_settings.logger.debug(f"Found sequence at position: {pos}")
255
- color = 'red' if strand == '-' else 'green'
256
- highlights.append((pos, len(sequence_to_find), color))
257
- else:
258
- self.global_settings.logger.warning(f"Sequence not found: {target_upper}")
259
-
260
- if not highlights:
261
- self.global_settings.logger.error("No sequences could be highlighted")
262
- return sequence
263
-
264
- self.global_settings.logger.debug(f"Found {len(highlights)} sequences to highlight")
265
-
266
- # Build highlighted sequence
267
- result = []
268
- last_pos = 0
269
- for pos, length, color in highlights:
270
- result.append(sequence[last_pos:pos])
271
- result.append(f"<span style='background-color: {color};'>")
272
- result.append(sequence[pos:pos+length])
273
- result.append("</span>")
274
- last_pos = pos + length
275
-
276
- result.append(sequence[last_pos:])
277
- final_sequence = ''.join(result)
278
-
279
- self.global_settings.logger.debug(f"Final highlighted sequence length: {len(final_sequence)}")
280
- return final_sequence
281
-
282
- except Exception as e:
283
- self.global_settings.logger.error(f"Error highlighting targets: {str(e)}\n{traceback.format_exc()}")
284
- return sequence
285
-
286
  def get_available_genes(self):
287
- """Get list of available genes from the loaded targets"""
288
  try:
289
- # Return the available genes list that was populated during load_targets
290
  if hasattr(self, 'available_genes'):
291
- return self.available_genes
292
-
293
- # If not already populated, get unique genes from targets
294
- genes = set()
295
- for target in self.targets:
296
- if 'feature_name' in target:
297
- genes.add(target['feature_name'])
298
-
299
- # Store for future use
300
- self.available_genes = sorted(list(genes))
301
- return self.available_genes
302
-
303
  except Exception as e:
304
- self.global_settings.logger.error(f"Error getting available genes: {str(e)}")
305
  return []
306
 
307
  # ... (other methods remain unchanged)
@@ -315,3 +261,90 @@ class ViewTargetsModel(HomeWindowModel):
315
  except Exception as e:
316
  logging.error(f"Error processing target: {e}")
317
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  self._chromosome_seqs = {}
41
  self._cached_targets = {} # Add cache for targets
42
 
43
+ # Connect to annotation file changes
44
+ self.global_settings.annotation_file_changed.connect(self._on_annotation_file_changed)
45
+
46
+ # Initialize annotation path
47
+ self.annotation_path = os.path.join(
48
+ self.global_settings.get_db_path(),
49
+ 'GBFF',
50
+ self.global_settings.get_current_annotation_file()
51
+ )
52
+ self.logger.debug(f"Initialized annotation path: {self.annotation_path}")
53
+
54
  def cleanup(self):
55
  """Cleanup method to be called when the view is closed"""
56
  try:
 
70
  def _on_annotation_file_changed(self, new_annotation_file):
71
  """Clear all caches when annotation file changes"""
72
  try:
73
+ self.logger.debug(f"ViewTargetsModel clearing caches for new annotation file: {new_annotation_file}")
74
  self._gene_data_cache.clear()
75
  self._sequence_cache.clear()
76
  self._parser_cache.clear()
 
88
  self._chromosome_seqs = {}
89
 
90
  except Exception as e:
91
+ self.logger.error(f"Error in _on_annotation_file_changed: {str(e)}")
92
 
93
def load_targets(self, selected_targets, organism, endonuclease):
    """Load CSPR targets for the selected features, one batch per chromosome.

    Args:
        selected_targets: Iterable of dicts, each providing the keys
            'chromosome', 'location' (a "start-end" string), 'feature_name'
            and 'feature_id'.
        organism: Organism name; combined with ``endonuclease`` to form the
            parser-cache key and to look up the CSPR file.
        endonuclease: Endonuclease name, forwarded to the CSPR parser.

    Side effects:
        Sets ``self.organism``, ``self.endonuclease`` and ``self.cspr_parser``;
        rebuilds ``self.targets`` (list of result dicts, each tagged with a
        'feature_id' when a matching selection is found) and
        ``self.available_genes`` (set of ``(feature_id, feature_name)``).

    Returns:
        None. Errors are logged (with stack trace) and not re-raised.
    """
    total_start = time.time()

    try:
        self.logger.debug(f"Starting load_targets with {len(selected_targets)} targets")

        # Store organism and endonuclease for potential reloading
        self.organism = organism
        self.endonuclease = endonuclease

        # Get CSPR parser from cache or create new one
        parser_start = time.time()
        cspr_key = f"{organism}_{endonuclease}"
        if cspr_key in self._parser_cache:
            self.cspr_parser = self._parser_cache[cspr_key]
            self.logger.debug("Using cached CSPR parser")
        else:
            org_files = self.get_organism_to_files()
            if organism not in org_files or endonuclease not in org_files[organism]:
                self.logger.error(f"No CSPR file found for {organism} and {endonuclease}")
                return

            cspr_file = org_files[organism][endonuclease][0]
            cspr_path = os.path.join(self.global_settings.get_db_path(), cspr_file)
            self.cspr_parser = CSPRparser(cspr_path, self.global_settings.get_casper_info_path())
            self._parser_cache[cspr_key] = self.cspr_parser
            self.logger.debug("Created new CSPR parser")
        parser_time = time.time() - parser_start
        self.logger.debug(f"CSPR parser initialization time: {parser_time:.2f} seconds")

        # Initialize targets and genes
        init_start = time.time()
        self.targets = []
        self.available_genes = set()
        init_time = time.time() - init_start
        self.logger.debug(f"Initialization time: {init_time:.2f} seconds")

        # Group targets by chromosome
        group_start = time.time()
        batch_targets = defaultdict(list)
        for target in selected_targets:
            chrom = target['chromosome']
            start, end = map(int, target['location'].split('-'))
            batch_targets[chrom].append({
                'feature_name': target['feature_name'],
                'feature_id': target['feature_id'],  # Include feature_id (locus_tag)
                'start': start,
                'end': end,
            })
            # Store both feature_id and feature_name
            self.available_genes.add((target['feature_id'], target['feature_name']))
        group_time = time.time() - group_start
        self.logger.debug(f"Target grouping time: {group_time:.2f} seconds")

        # Process targets by chromosome
        process_start = time.time()
        target_count = 0
        for chrom, targets in batch_targets.items():
            batch_start = time.time()
            # A parser that finds nothing may hand back None; treat that as
            # an empty batch so one chromosome cannot abort all the others.
            results = self.cspr_parser.read_targets_batch(chrom, targets, endonuclease) or []

            # Tag each result with the feature_id of the first selection on
            # this chromosome that contains its position and shares its name.
            for result in results:
                match = next(
                    (
                        candidate for candidate in targets
                        if candidate['start'] <= result['position'] <= candidate['end']
                        and candidate['feature_name'] == result['feature_name']
                    ),
                    None,
                )
                if match is not None:
                    result['feature_id'] = match['feature_id']

            self.targets.extend(results)
            target_count += len(results)
            batch_time = time.time() - batch_start
            self.logger.debug(f"Chromosome {chrom} processing time: {batch_time:.2f} seconds")
        process_time = time.time() - process_start
        self.logger.debug(f"Total target processing time: {process_time:.2f} seconds")

        total_time = time.time() - total_start
        self.logger.debug(f"Total load_targets execution time: {total_time:.2f} seconds")
        self.logger.debug(f"Found {target_count} total CSPR targets")

    except Exception as e:
        self.logger.error(f"Error in load_targets: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
176
 
177
  def _get_chromosome_sequence(self, chromosome):
178
  """Get chromosome sequence on demand"""
 
194
  if self.annotation_path:
195
  self.annotation_parser.set_annotation_file(self.annotation_path)
196
 
197
def get_gene_data(self, locus_tag):
    """Return annotation data for a locus tag, using the model cache.

    The tag is stripped and lower-cased before it is handed to the
    annotation parser; the same normalized form is used as the cache key so
    that spelling variants of one tag share a single cache entry.

    Args:
        locus_tag: Locus tag as ``str`` or ``int``. Falsy values and other
            types yield ``None``.

    Returns:
        Whatever the annotation parser returns for the tag (logged via
        ``.keys()``, so presumably a dict), or ``None`` when the tag is
        missing, unknown, or an error occurs. Errors are logged, not raised.
    """
    try:
        if not locus_tag:
            self.logger.debug("No locus tag provided")
            return None

        if not isinstance(locus_tag, (str, int)):
            self.logger.debug(f"No gene data found for locus tag: {locus_tag}")
            return None

        # Normalize once; both the cache and the parser use this form.
        locus_tag_str = str(locus_tag).strip()
        lookup_key = locus_tag_str.lower()

        # Check model cache first
        if lookup_key in self._gene_data_cache:
            return self._gene_data_cache[lookup_key]

        # Initialize annotation parser if not already done
        if getattr(self, 'annotation_parser', None) is None:
            self.annotation_parser = AnnotationParser(self.global_settings)
            annotation_file = self.global_settings.get_current_annotation_file()
            annotation_path = os.path.join(self.global_settings.get_db_path(), 'GBFF', annotation_file)
            self.annotation_parser.set_annotation_file(annotation_path)
            self.logger.debug(f"Initialized annotation parser with file: {annotation_path}")

        # Look up by locus tag directly
        self.logger.debug(f"Searching for locus tag: {locus_tag_str}")
        gene_data = self.annotation_parser.get_gene_data(lookup_key)

        if gene_data:
            self._gene_data_cache[lookup_key] = gene_data
            self.logger.debug(f"Found gene data: {gene_data.keys()}")
        else:
            self.logger.debug(f"No gene data found for locus tag: {locus_tag}")

        return gene_data

    except Exception as e:
        self.logger.error(f"Error getting gene data: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
        return None
236
 
237
  def get_targets(self):
238
+ """Return all targets with their feature IDs"""
239
  return self.targets
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
def get_available_genes(self):
    """Return the loaded genes as sorted 'feature_id: feature_name' strings.

    Reads ``self.available_genes``, an iterable of
    ``(feature_id, feature_name)`` pairs.

    Returns:
        A list of display strings in sorted pair order, or ``[]`` when no
        genes have been loaded or an error occurs (errors are logged).
    """
    try:
        if not hasattr(self, 'available_genes'):
            return []

        try:
            ordered = sorted(self.available_genes)
        except TypeError:
            # Pairs whose ids mix types (e.g. None next to str) cannot be
            # compared directly; order them by their text form instead of
            # dropping the whole gene list.
            ordered = sorted(
                self.available_genes,
                key=lambda pair: tuple(str(part) for part in pair),
            )

        # Format as "feature_id: feature_name"
        return [f"{feature_id}: {feature_name}" for feature_id, feature_name in ordered]

    except Exception as e:
        self.logger.error(f"Error getting available genes: {str(e)}")
        return []
252
 
253
  # ... (other methods remain unchanged)
 
261
  except Exception as e:
262
  logging.error(f"Error processing target: {e}")
263
  return None
264
+
265
def get_gene_sequence(self, locus_tag):
    """Return the padded sequence for a gene, caching the result.

    Args:
        locus_tag: Locus tag passed through to ``get_gene_data``.

    Returns:
        A dict with keys 'sequence' (lower-case flanks around the upper-case
        gene body), 'chrom_length', 'start', 'end', 'padded_start' and
        'padded_end'; or ``None`` when gene data, a parsable location, or
        the sequence itself is unavailable. Errors are logged, not raised.
    """
    try:
        # Check sequence cache first
        cache_key = f"{locus_tag}_sequence"
        if cache_key in self._sequence_cache:
            self.logger.debug(f"Cache hit for gene sequence: {locus_tag}")
            return self._sequence_cache[cache_key]

        # Get gene data which includes location information
        self.logger.debug(f"Getting gene data for locus tag: {locus_tag}")
        gene_data = self.get_gene_data(locus_tag)
        if not gene_data or 'info' not in gene_data:
            self.logger.warning(f"No gene data found for locus tag: {locus_tag}")
            return None

        # Parse location string (format: "start:end(strand)")
        location = gene_data['info']['location']
        if ':' not in location:
            self.logger.warning(f"Invalid location format: {location}")
            return None

        location_parts = location.split(':')
        start = int(location_parts[0])
        end = int(location_parts[1].split('(')[0])

        if 'sequence' not in gene_data:
            self.logger.warning(f"No sequence available for locus tag: {locus_tag}")
            return None

        # NOTE(review): start/end index straight into this string, so it is
        # presumably the full chromosome sequence -- confirm with the parser.
        sequence = gene_data['sequence']

        # Add padding (30 bases on each side), clamped to the sequence bounds
        padding = 30
        seq_start = max(0, start - padding)
        seq_end = min(len(sequence), end + padding)

        # Flanks are lower-cased so the gene body stands out in upper case
        five_prime_pad = sequence[seq_start:start].lower() if seq_start < start else ""
        main_seq = sequence[start:end].upper()
        three_prime_pad = sequence[end:seq_end].lower() if end < seq_end else ""

        full_sequence = five_prime_pad + main_seq + three_prime_pad

        # Cache the result
        result = {
            'sequence': full_sequence,
            'chrom_length': len(sequence),
            'start': start,
            'end': end,
            'padded_start': seq_start,
            'padded_end': seq_end,
        }
        self._sequence_cache[cache_key] = result

        self.logger.debug(f"Retrieved and cached sequence for locus tag {locus_tag} ({len(full_sequence)} bp)")
        return result

    except Exception as e:
        self.logger.error(f"Error getting gene sequence: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
        return None
326
+
327
def get_scoring_options(self):
    """Return the active scoring options, creating the defaults on first use.

    Returns:
        The dict stored on ``self.scoring_options`` (the same object on
        every call), or ``{}`` if an error occurs; errors are logged.
    """
    try:
        # Fast path: options were already set or initialised.
        if hasattr(self, 'scoring_options'):
            return self.scoring_options

        # First access: install the default configuration.
        defaults = {}
        defaults['algorithm'] = 'Azimuth 2.0'
        defaults['fasta_file'] = ''
        defaults['min_score'] = 0
        defaults['max_score'] = 100
        self.scoring_options = defaults
        return self.scoring_options

    except Exception as e:
        self.logger.error(f"Error getting scoring options: {str(e)}")
        return {}
342
+
343
def set_scoring_options(self, options):
    """Replace the stored scoring options.

    Args:
        options: The new options object; it is stored as-is (no copy) on
            ``self.scoring_options``.

    Returns:
        None. Any error is logged rather than raised.
    """
    try:
        # Keep a reference to the caller's object, then record the change.
        self.scoring_options = options
        self.logger.debug(f"Updated scoring options: {options}")
    except Exception as e:
        self.logger.error(f"Error setting scoring options: {str(e)}")
src/ui/new_endonuclease_window.ui CHANGED
@@ -20,7 +20,7 @@
20
  </property>
21
  <widget class="QWidget" name="centralwidget">
22
  <layout class="QGridLayout" name="gridLayout_2">
23
- <item row="11" column="0">
24
  <layout class="QHBoxLayout" name="boxlayhbotButtons">
25
  <property name="sizeConstraint">
26
  <enum>QLayout::SetDefaultConstraint</enum>
@@ -86,7 +86,7 @@
86
  </item>
87
  </layout>
88
  </item>
89
- <item row="10" column="0">
90
  <layout class="QGridLayout" name="gridLayout">
91
  <item row="0" column="0">
92
  <layout class="QVBoxLayout" name="verticalLayout">
@@ -194,6 +194,33 @@
194
  </item>
195
  </layout>
196
  </item>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  </layout>
198
  </widget>
199
  </item>
@@ -459,39 +486,6 @@
459
  </item>
460
  </layout>
461
  </item>
462
- <item row="6" column="0">
463
- <layout class="QHBoxLayout" name="horizontalLayout_2">
464
- <property name="spacing">
465
- <number>-1</number>
466
- </property>
467
- <property name="sizeConstraint">
468
- <enum>QLayout::SetDefaultConstraint</enum>
469
- </property>
470
- <property name="bottomMargin">
471
- <number>0</number>
472
- </property>
473
- <item>
474
- <spacer name="horizontalSpacer">
475
- <property name="orientation">
476
- <enum>Qt::Horizontal</enum>
477
- </property>
478
- <property name="sizeHint" stdset="0">
479
- <size>
480
- <width>40</width>
481
- <height>20</height>
482
- </size>
483
- </property>
484
- </spacer>
485
- </item>
486
- <item>
487
- <widget class="QLabel" name="lblRequired">
488
- <property name="text">
489
- <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; color:#fc0107;&quot;&gt;* Required&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
490
- </property>
491
- </widget>
492
- </item>
493
- </layout>
494
- </item>
495
  </layout>
496
  </widget>
497
  </widget>
 
20
  </property>
21
  <widget class="QWidget" name="centralwidget">
22
  <layout class="QGridLayout" name="gridLayout_2">
23
+ <item row="10" column="0">
24
  <layout class="QHBoxLayout" name="boxlayhbotButtons">
25
  <property name="sizeConstraint">
26
  <enum>QLayout::SetDefaultConstraint</enum>
 
86
  </item>
87
  </layout>
88
  </item>
89
+ <item row="9" column="0">
90
  <layout class="QGridLayout" name="gridLayout">
91
  <item row="0" column="0">
92
  <layout class="QVBoxLayout" name="verticalLayout">
 
194
  </item>
195
  </layout>
196
  </item>
197
+ <item row="0" column="0">
198
+ <layout class="QHBoxLayout" name="horizontalLayout_2">
199
+ <property name="bottomMargin">
200
+ <number>0</number>
201
+ </property>
202
+ <item>
203
+ <spacer name="horizontalSpacer">
204
+ <property name="orientation">
205
+ <enum>Qt::Horizontal</enum>
206
+ </property>
207
+ <property name="sizeHint" stdset="0">
208
+ <size>
209
+ <width>40</width>
210
+ <height>20</height>
211
+ </size>
212
+ </property>
213
+ </spacer>
214
+ </item>
215
+ <item>
216
+ <widget class="QLabel" name="lblRequired">
217
+ <property name="text">
218
+ <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; color:#fc0107;&quot;&gt;* Required&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
219
+ </property>
220
+ </widget>
221
+ </item>
222
+ </layout>
223
+ </item>
224
  </layout>
225
  </widget>
226
  </item>
 
486
  </item>
487
  </layout>
488
  </item>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  </layout>
490
  </widget>
491
  </widget>
src/ui/{scoring_window.ui → scoring_options.ui} RENAMED
@@ -15,56 +15,8 @@
15
  </property>
16
  <widget class="QWidget" name="centralwidget">
17
  <layout class="QGridLayout" name="gridLayout">
18
- <item row="1" column="2">
19
- <spacer name="horizontalSpacer_2">
20
- <property name="orientation">
21
- <enum>Qt::Horizontal</enum>
22
- </property>
23
- <property name="sizeType">
24
- <enum>QSizePolicy::Fixed</enum>
25
- </property>
26
- <property name="sizeHint" stdset="0">
27
- <size>
28
- <width>5</width>
29
- <height>20</height>
30
- </size>
31
- </property>
32
- </spacer>
33
- </item>
34
- <item row="2" column="1">
35
- <spacer name="verticalSpacer">
36
- <property name="orientation">
37
- <enum>Qt::Vertical</enum>
38
- </property>
39
- <property name="sizeType">
40
- <enum>QSizePolicy::Fixed</enum>
41
- </property>
42
- <property name="sizeHint" stdset="0">
43
- <size>
44
- <width>20</width>
45
- <height>5</height>
46
- </size>
47
- </property>
48
- </spacer>
49
- </item>
50
- <item row="1" column="0">
51
- <spacer name="horizontalSpacer">
52
- <property name="orientation">
53
- <enum>Qt::Horizontal</enum>
54
- </property>
55
- <property name="sizeType">
56
- <enum>QSizePolicy::Fixed</enum>
57
- </property>
58
- <property name="sizeHint" stdset="0">
59
- <size>
60
- <width>5</width>
61
- <height>20</height>
62
- </size>
63
- </property>
64
- </spacer>
65
- </item>
66
- <item row="1" column="1">
67
- <widget class="QGroupBox" name="groupBox">
68
  <property name="sizePolicy">
69
  <sizepolicy hsizetype="Minimum" vsizetype="Minimum">
70
  <horstretch>0</horstretch>
@@ -76,7 +28,7 @@
76
  </property>
77
  <layout class="QGridLayout" name="gridLayout_2">
78
  <item row="0" column="0" colspan="4">
79
- <widget class="QLabel" name="label">
80
  <property name="sizePolicy">
81
  <sizepolicy hsizetype="Preferred" vsizetype="Minimum">
82
  <horstretch>0</horstretch>
@@ -92,21 +44,21 @@
92
  </widget>
93
  </item>
94
  <item row="1" column="3">
95
- <widget class="QPushButton" name="browse_button">
96
  <property name="text">
97
  <string>Browse...</string>
98
  </property>
99
  </widget>
100
  </item>
101
  <item row="2" column="0">
102
- <widget class="QLabel" name="label_2">
103
  <property name="text">
104
  <string>Select Algorithm:</string>
105
  </property>
106
  </widget>
107
  </item>
108
  <item row="1" column="0">
109
- <widget class="QLabel" name="fasta_label">
110
  <property name="sizePolicy">
111
  <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
112
  <horstretch>0</horstretch>
@@ -122,7 +74,7 @@
122
  </widget>
123
  </item>
124
  <item row="2" column="1">
125
- <widget class="QRadioButton" name="azimuth_button">
126
  <property name="text">
127
  <string>Azimuth 2.0</string>
128
  </property>
@@ -131,22 +83,15 @@
131
  </property>
132
  </widget>
133
  </item>
134
- <item row="4" column="0" colspan="3">
135
- <widget class="QProgressBar" name="progressBar">
136
- <property name="value">
137
- <number>0</number>
138
- </property>
139
- </widget>
140
- </item>
141
  <item row="4" column="3">
142
- <widget class="QPushButton" name="submit_button">
143
  <property name="text">
144
  <string>Submit</string>
145
  </property>
146
  </widget>
147
  </item>
148
  <item row="1" column="1" colspan="2">
149
- <widget class="QLineEdit" name="fasta_edit">
150
  <property name="readOnly">
151
  <bool>true</bool>
152
  </property>
@@ -155,22 +100,6 @@
155
  </layout>
156
  </widget>
157
  </item>
158
- <item row="0" column="1">
159
- <spacer name="verticalSpacer_3">
160
- <property name="orientation">
161
- <enum>Qt::Vertical</enum>
162
- </property>
163
- <property name="sizeType">
164
- <enum>QSizePolicy::Fixed</enum>
165
- </property>
166
- <property name="sizeHint" stdset="0">
167
- <size>
168
- <width>20</width>
169
- <height>5</height>
170
- </size>
171
- </property>
172
- </spacer>
173
- </item>
174
  </layout>
175
  </widget>
176
  </widget>
 
15
  </property>
16
  <widget class="QWidget" name="centralwidget">
17
  <layout class="QGridLayout" name="gridLayout">
18
+ <item row="0" column="0">
19
+ <widget class="QGroupBox" name="grpSelectScoring">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  <property name="sizePolicy">
21
  <sizepolicy hsizetype="Minimum" vsizetype="Minimum">
22
  <horstretch>0</horstretch>
 
28
  </property>
29
  <layout class="QGridLayout" name="gridLayout_2">
30
  <item row="0" column="0" colspan="4">
31
+ <widget class="QLabel" name="lblSelectFASTAfile">
32
  <property name="sizePolicy">
33
  <sizepolicy hsizetype="Preferred" vsizetype="Minimum">
34
  <horstretch>0</horstretch>
 
44
  </widget>
45
  </item>
46
  <item row="1" column="3">
47
+ <widget class="QPushButton" name="pbtnBrowse">
48
  <property name="text">
49
  <string>Browse...</string>
50
  </property>
51
  </widget>
52
  </item>
53
  <item row="2" column="0">
54
+ <widget class="QLabel" name="lblSelectAlgorithm">
55
  <property name="text">
56
  <string>Select Algorithm:</string>
57
  </property>
58
  </widget>
59
  </item>
60
  <item row="1" column="0">
61
+ <widget class="QLabel" name="lblInputFasta">
62
  <property name="sizePolicy">
63
  <sizepolicy hsizetype="Minimum" vsizetype="Preferred">
64
  <horstretch>0</horstretch>
 
74
  </widget>
75
  </item>
76
  <item row="2" column="1">
77
+ <widget class="QRadioButton" name="rbtnAzimuth">
78
  <property name="text">
79
  <string>Azimuth 2.0</string>
80
  </property>
 
83
  </property>
84
  </widget>
85
  </item>
 
 
 
 
 
 
 
86
  <item row="4" column="3">
87
+ <widget class="QPushButton" name="pbtnSubmit">
88
  <property name="text">
89
  <string>Submit</string>
90
  </property>
91
  </widget>
92
  </item>
93
  <item row="1" column="1" colspan="2">
94
+ <widget class="QLineEdit" name="ledInputFASTA">
95
  <property name="readOnly">
96
  <bool>true</bool>
97
  </property>
 
100
  </layout>
101
  </widget>
102
  </item>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  </layout>
104
  </widget>
105
  </widget>
src/ui/view_targets.ui CHANGED
@@ -6,7 +6,7 @@
6
  <rect>
7
  <x>0</x>
8
  <y>0</y>
9
- <width>1295</width>
10
  <height>916</height>
11
  </rect>
12
  </property>
@@ -115,8 +115,8 @@
115
  <string>Guide Viewer</string>
116
  </property>
117
  <layout class="QGridLayout" name="gridLayout_4">
118
- <item row="8" column="0">
119
- <widget class="QCheckBox" name="chkSelectAll">
120
  <property name="sizePolicy">
121
  <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
122
  <horstretch>0</horstretch>
@@ -124,14 +124,21 @@
124
  </sizepolicy>
125
  </property>
126
  <property name="text">
127
- <string>Select All</string>
128
  </property>
129
  </widget>
130
  </item>
131
- <item row="4" column="1" colspan="4">
132
- <layout class="QHBoxLayout" name="horizontalLayout">
 
 
 
 
 
 
 
133
  <item>
134
- <widget class="QComboBox" name="cmbGene">
135
  <property name="sizePolicy">
136
  <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
137
  <horstretch>0</horstretch>
@@ -148,8 +155,27 @@
148
  </item>
149
  </layout>
150
  </item>
151
- <item row="9" column="0" colspan="5">
152
- <widget class="QTableWidget" name="tblTargets"/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  </item>
154
  <item row="5" column="0">
155
  <widget class="QLabel" name="lblEndonuclease">
@@ -164,32 +190,26 @@
164
  </property>
165
  </widget>
166
  </item>
167
- <item row="8" column="1" alignment="Qt::AlignLeft">
168
- <widget class="QPushButton" name="pbtnFilterOptions">
 
 
 
169
  <property name="sizePolicy">
170
- <sizepolicy hsizetype="Preferred" vsizetype="Fixed">
171
  <horstretch>0</horstretch>
172
  <verstretch>0</verstretch>
173
  </sizepolicy>
174
  </property>
175
- <property name="minimumSize">
176
- <size>
177
- <width>125</width>
178
- <height>0</height>
179
- </size>
180
- </property>
181
- <property name="toolTip">
182
- <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; font-size:12pt;&quot;&gt;Additional options for filtering the Guide Viewer Table.&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
183
- </property>
184
  <property name="text">
185
- <string>Filter Options</string>
186
  </property>
187
  </widget>
188
  </item>
189
- <item row="5" column="1" colspan="4">
190
- <layout class="QHBoxLayout" name="horizontalLayout_2">
191
  <item>
192
- <widget class="QComboBox" name="cmbEndonuclease">
193
  <property name="sizePolicy">
194
  <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
195
  <horstretch>0</horstretch>
@@ -206,26 +226,6 @@
206
  </item>
207
  </layout>
208
  </item>
209
- <item row="8" column="2">
210
- <widget class="QPushButton" name="pbtnScoringOptions">
211
- <property name="text">
212
- <string>Scoring Options</string>
213
- </property>
214
- </widget>
215
- </item>
216
- <item row="4" column="0">
217
- <widget class="QLabel" name="lblGene">
218
- <property name="sizePolicy">
219
- <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
220
- <horstretch>0</horstretch>
221
- <verstretch>0</verstretch>
222
- </sizepolicy>
223
- </property>
224
- <property name="text">
225
- <string>Gene:</string>
226
- </property>
227
- </widget>
228
- </item>
229
  </layout>
230
  </widget>
231
  </item>
 
6
  <rect>
7
  <x>0</x>
8
  <y>0</y>
9
+ <width>1315</width>
10
  <height>916</height>
11
  </rect>
12
  </property>
 
115
  <string>Guide Viewer</string>
116
  </property>
117
  <layout class="QGridLayout" name="gridLayout_4">
118
+ <item row="4" column="0">
119
+ <widget class="QLabel" name="lblGene">
120
  <property name="sizePolicy">
121
  <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
122
  <horstretch>0</horstretch>
 
124
  </sizepolicy>
125
  </property>
126
  <property name="text">
127
+ <string>Gene:</string>
128
  </property>
129
  </widget>
130
  </item>
131
+ <item row="8" column="2">
132
+ <widget class="QPushButton" name="pbtnScoringOptions">
133
+ <property name="text">
134
+ <string>Scoring Options</string>
135
+ </property>
136
+ </widget>
137
+ </item>
138
+ <item row="5" column="1" colspan="4">
139
+ <layout class="QHBoxLayout" name="horizontalLayout_2">
140
  <item>
141
+ <widget class="QComboBox" name="cmbEndonuclease">
142
  <property name="sizePolicy">
143
  <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
144
  <horstretch>0</horstretch>
 
155
  </item>
156
  </layout>
157
  </item>
158
+ <item row="8" column="1" alignment="Qt::AlignLeft">
159
+ <widget class="QPushButton" name="pbtnFilterOptions">
160
+ <property name="sizePolicy">
161
+ <sizepolicy hsizetype="Preferred" vsizetype="Fixed">
162
+ <horstretch>0</horstretch>
163
+ <verstretch>0</verstretch>
164
+ </sizepolicy>
165
+ </property>
166
+ <property name="minimumSize">
167
+ <size>
168
+ <width>125</width>
169
+ <height>0</height>
170
+ </size>
171
+ </property>
172
+ <property name="toolTip">
173
+ <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p&gt;&lt;span style=&quot; font-size:12pt;&quot;&gt;Additional options for filtering the Guide Viewer Table.&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
174
+ </property>
175
+ <property name="text">
176
+ <string>Filter Options</string>
177
+ </property>
178
+ </widget>
179
  </item>
180
  <item row="5" column="0">
181
  <widget class="QLabel" name="lblEndonuclease">
 
190
  </property>
191
  </widget>
192
  </item>
193
+ <item row="9" column="0" colspan="5">
194
+ <widget class="QTableWidget" name="tblTargets"/>
195
+ </item>
196
+ <item row="8" column="0">
197
+ <widget class="QCheckBox" name="chkSelectAll">
198
  <property name="sizePolicy">
199
+ <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
200
  <horstretch>0</horstretch>
201
  <verstretch>0</verstretch>
202
  </sizepolicy>
203
  </property>
 
 
 
 
 
 
 
 
 
204
  <property name="text">
205
+ <string>Select All</string>
206
  </property>
207
  </widget>
208
  </item>
209
+ <item row="4" column="1" colspan="4">
210
+ <layout class="QHBoxLayout" name="horizontalLayout">
211
  <item>
212
+ <widget class="QComboBox" name="cmbGene">
213
  <property name="sizePolicy">
214
  <sizepolicy hsizetype="Expanding" vsizetype="Fixed">
215
  <horstretch>0</horstretch>
 
226
  </item>
227
  </layout>
228
  </item>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  </layout>
230
  </widget>
231
  </item>
src/utils/LoggingMixin.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional
3
+
4
class LoggingMixin:
    """Mixin that gives a class a logger named after its concrete class.

    The logger is created eagerly by ``__init__`` and lazily by the
    ``logger`` property, so the mixin also works for subclasses whose own
    ``__init__`` never calls ``LoggingMixin.__init__``.
    """

    def __init__(self):
        self._logger: Optional[logging.Logger] = None
        self._init_logger()

    def _init_logger(self) -> None:
        """Bind ``self._logger`` to the logger named after the concrete class."""
        self._logger = logging.getLogger(self.__class__.__name__)

    @property
    def logger(self) -> logging.Logger:
        """Return the logger instance, creating it on first access if needed."""
        # getattr (rather than plain attribute access) keeps the lazy path
        # usable when __init__ was skipped and ``_logger`` was never set.
        if getattr(self, "_logger", None) is None:
            self._init_logger()
        return self._logger

    def log_method_call(self, method_name: str, *args, **kwargs) -> None:
        """Log, at DEBUG level, a method name with its positional and keyword arguments."""
        self.logger.debug(f"Calling {method_name} with args: {args}, kwargs: {kwargs}")

    def log_error(self, method_name: str, error: Exception) -> None:
        """Log ``error`` at ERROR level, prefixed with the method it occurred in."""
        # Pass the exception itself so its traceback is attached even when
        # this is called outside of an ``except`` block (``exc_info=True``
        # would log "NoneType: None" there).
        self.logger.error(f"Error in {method_name}: {str(error)}", exc_info=error)

    def log_info(self, message: str) -> None:
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def log_debug(self, message: str) -> None:
        """Log ``message`` at DEBUG level."""
        self.logger.debug(message)

    def log_warning(self, message: str) -> None:
        """Log ``message`` at WARNING level."""
        self.logger.warning(message)
src/utils/azimuth/__init__.py ADDED
File without changes
src/utils/azimuth/features/__init__.py ADDED
File without changes
src/utils/azimuth/features/featurization.py ADDED
@@ -0,0 +1,546 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas
2
+ import time
3
+ import sklearn
4
+ import numpy as np
5
+ import Bio.SeqUtils as SeqUtil
6
+ import Bio.Seq as Seq
7
+ import azimuth.util
8
+ import sys
9
+ from Bio.SeqUtils import MeltingTemp as Tm
10
+ import pickle
11
+ import itertools
12
+
13
def _one_hot_labels(labels):
    '''
    Return a dense one-hot matrix (one row per entry of `labels`, one column
    per distinct label) built with scikit-learn's LabelEncoder/OneHotEncoder.
    '''
    # `import sklearn` alone does not guarantee that the `preprocessing`
    # submodule is loaded, so import it explicitly where it is used.
    import sklearn.preprocessing

    label_encoder = sklearn.preprocessing.LabelEncoder()
    label_encoder.fit(labels)
    enc = sklearn.preprocessing.OneHotEncoder()
    return np.array(enc.fit_transform(label_encoder.transform(labels)[:, None]).todense())


def featurize_data(data, learn_options, Y, gene_position, pam_audit=True, length_audit=True, quiet=True):
    '''
    Build the dictionary of feature sets selected by `learn_options`.

    data           -- DataFrame with a '30mer' column (plus 'Strand',
                      'sgRNA Score', ... when the matching options are on)
    learn_options  -- dict of switches; each "include_*"/"*_features" key
                      read below enables one feature set
    Y              -- target DataFrame; only read for the gene effect, known
                      pairs, drug, gene and microhomology feature sets
    gene_position  -- DataFrame whose columns become feature sets when
                      "include_gene_position" is on (must then contain
                      "Percent Peptide")
    pam_audit      -- forwarded to the NGGX and Tm features (GG check)
    length_audit   -- forwarded to the GC features (30mer length check)
    quiet          -- suppress progress printing

    Returns a dict mapping feature-set name -> DataFrame.  All sequences in
    data['30mer'] must have the same length (asserted).
    '''
    all_lens = data['30mer'].apply(len).values
    unique_lengths = np.unique(all_lens)
    num_lengths = len(unique_lengths)
    assert num_lengths == 1, "should only have sequences of a single length, but found %s: %s" % (num_lengths, str(unique_lengths))

    if not quiet:
        print("Constructing features...")
    t0 = time.time()

    feature_sets = {}

    if learn_options["nuc_features"]:
        # spectrum kernels (position-independent) and weighted degree kernels (position-dependent)
        get_all_order_nuc_features(data['30mer'], feature_sets, learn_options, learn_options["order"], max_index_to_use=30, quiet=quiet)

    check_feature_set(feature_sets)

    if learn_options["gc_features"]:
        gc_above_10, gc_below_10, gc_count = gc_features(data, length_audit)
        feature_sets['gc_above_10'] = pandas.DataFrame(gc_above_10)
        feature_sets['gc_below_10'] = pandas.DataFrame(gc_below_10)
        feature_sets['gc_count'] = pandas.DataFrame(gc_count)

    if learn_options["include_gene_position"]:
        # one feature set per gene-position column, plus a thresholded copy
        # of "Percent Peptide"
        for column in gene_position.columns:
            feature_sets[column] = pandas.DataFrame(gene_position[column])
        feature_sets["Percent Peptide <50%"] = feature_sets["Percent Peptide"] < 50
        # rename the single column so it does not clash with "Percent Peptide"
        feature_sets["Percent Peptide <50%"]['Percent Peptide <50%'] = feature_sets["Percent Peptide <50%"].pop("Percent Peptide")

    if learn_options["include_gene_effect"]:
        print("including gene effect")
        gene_names = Y['Target gene']
        one_hot_genes = _one_hot_labels(gene_names)
        feature_sets["gene effect"] = pandas.DataFrame(one_hot_genes,
                                                       columns=["gene_%d" % i for i in range(one_hot_genes.shape[1])], index=gene_names.index)

    if learn_options['include_known_pairs']:
        feature_sets['known pairs'] = pandas.DataFrame(Y['test'])

    if learn_options["include_NGGX_interaction"]:
        feature_sets["NGGX"] = NGGX_interaction_feature(data, pam_audit)

    if learn_options["include_Tm"]:
        # learn_options is deliberately not forwarded here (kept from the
        # original), so Tm_feature always uses its default segments
        feature_sets["Tm"] = Tm_feature(data, pam_audit, learn_options=None)

    if learn_options["include_sgRNAscore"]:
        feature_sets["sgRNA Score"] = pandas.DataFrame(data["sgRNA Score"])

    if learn_options["include_drug"]:
        drug_names = Y.index.get_level_values('drug').tolist()
        one_hot_drugs = _one_hot_labels(drug_names)
        feature_sets["drug"] = pandas.DataFrame(one_hot_drugs, columns=["drug_%d" % i for i in range(one_hot_drugs.shape[1])], index=drug_names)

    if learn_options['include_strand']:
        feature_sets['Strand effect'] = (pandas.DataFrame(data['Strand']) == 'sense')*1

    if learn_options["include_gene_feature"]:
        feature_sets["gene features"] = gene_feature(Y, data, learn_options)

    if learn_options["include_gene_guide_feature"] > 0:
        tmp_feature_sets = gene_guide_feature(Y, data, learn_options)
        for key in tmp_feature_sets:
            feature_sets[key] = tmp_feature_sets[key]

    if learn_options["include_microhomology"]:
        feature_sets["microhomology"] = get_micro_homology_features(Y['Target gene'], learn_options, data)

    t1 = time.time()
    if not quiet:
        print("\t\tElapsed time for constructing features is %.2f seconds" % (t1-t0))

    check_feature_set(feature_sets)

    if learn_options['normalize_features']:
        # NOTE: normalisation does not make sense for one-off predictions,
        # but can for internal model comparisons with regularized models.
        # The original expressed this as `assert("<message>")`, which is
        # always true and therefore never stopped execution; behaviour is
        # kept (normalisation proceeds) and the no-op assert is dropped.
        feature_sets = normalize_feature_sets(feature_sets)
        check_feature_set(feature_sets)

    return feature_sets
108
+
109
+
110
def check_feature_set(feature_sets):
    '''
    Sanity-check a dict of feature sets: it must be non-empty, every set
    must have the same number of rows (individuals), and no set may
    contain NaN.  Raises AssertionError / Exception otherwise.
    '''
    assert feature_sets != {}, "no feature sets present"

    expected_rows = None
    for name in feature_sets:
        n_rows = feature_sets[name].shape[0]
        if expected_rows is None:
            # first set seen defines the reference row count
            expected_rows = n_rows
            continue
        assert expected_rows >= 1, "should be at least one individual"
        assert expected_rows == n_rows, "# of individuals do not match up across feature sets"

    for name in feature_sets:
        has_nan = np.any(np.isnan(feature_sets[name]))
        if has_nan:
            raise Exception("found Nan in set %s" % name)
128
+
129
+
130
def NGGX_interaction_feature(data, pam_audit=True):
    '''
    For every 30mer, take the nucleotide just before the GG of the PAM
    (index 24) and the one just after it (index 27) and one-hot encode that
    pair as an order-2, position-dependent feature (4x4=16 columns).

    With pam_audit on, raises if indices 25:27 of a sequence are not "GG".
    Returns a DataFrame with one row per sequence.
    '''
    sequences = data['30mer'].values
    encoded = pandas.DataFrame()
    for thirty_mer in sequences:
        # check that GG is where we think
        if pam_audit:
            pam_core = thirty_mer[25:27]
            if pam_core != "GG":
                raise Exception("expected GG but found %s" % pam_core)
        flanking_pair = thirty_mer[24] + thirty_mer[27]
        one_hot = nucleotide_features(flanking_pair, order=2, feature_type='pos_dependent', max_index_to_use=2, prefix="NGGX")
        encoded = pandas.concat([encoded, one_hot], axis=1)
    # sequences were accumulated as columns; flip so each one is a row
    return encoded.T
146
+
147
+
148
def get_all_order_nuc_features(data, feature_sets, learn_options, maxorder, max_index_to_use, prefix="", quiet=False):
    '''
    Add k-mer feature sets for every order from 1 to `maxorder` to
    `feature_sets` (modified in place; nothing is returned).

    The position-dependent set is always stored under
    '<prefix>_nuc_pd_Order<k>'; the position-independent one is stored under
    '<prefix>_nuc_pi_Order<k>' only when learn_options['include_pi_nuc_feat']
    is truthy.  The accumulated dict is validated after each order.
    '''
    verbose = not quiet
    order = 1
    while order <= maxorder:
        if verbose:
            print("\t\tconstructing order %s features" % order)
        pos_dependent, pos_independent = apply_nucleotide_features(
            data, order, learn_options["num_proc"],
            include_pos_independent=True, max_index_to_use=max_index_to_use, prefix=prefix)
        feature_sets['%s_nuc_pd_Order%i' % (prefix, order)] = pos_dependent
        if learn_options['include_pi_nuc_feat']:
            feature_sets['%s_nuc_pi_Order%i' % (prefix, order)] = pos_independent
        check_feature_set(feature_sets)

        if verbose:
            print("\t\t\t\t\t\t\tdone")
        order += 1
161
+
162
+
163
def countGC(s, length_audit=True):
    '''
    Count the characters of s[4:24] (the 20mer inside the 30mer, as per the
    Doench paper/code) that are neither 'A' nor 'T'.

    With length_audit on, asserts that `s` is exactly 30 characters long.
    '''
    if length_audit:
        assert len(s) == 30, "seems to assume 30mer"
    twenty_mer = s[4:24]
    return sum(1 for base in twenty_mer if base not in ('A', 'T'))
170
+
171
+
172
def SeqUtilFeatures(data):
    '''
    assuming '30mer' is a key

    Return a one-column DataFrame (one row per sequence, default integer
    index) holding SeqUtil.molecular_weight of each 30mer, called with its
    default arguments.

    Asserts that every sequence is exactly 30 characters long.
    (The previous docstring described melting-temperature features, which
    this function does not compute; see Tm_feature for those.)
    '''
    sequence = data['30mer'].values
    num_features = 1
    featarray = np.ones((sequence.shape[0], num_features))
    for i, seq in enumerate(sequence):
        assert len(seq) == 30, "seems to assume 30mer"
        featarray[i, 0] = SeqUtil.molecular_weight(str(seq))

    return pandas.DataFrame(featarray)
190
+
191
+
192
def organism_feature(data):
    '''
    Human vs. mouse

    One-hot encode data['Organism']: one float column per distinct value,
    named 'Organism_<value>', indexed like `data`.

    NOTE(review): the original body referenced an undefined `featarray` and
    dropped into an ipdb breakpoint, so it could never return.  The intended
    encoding is not recoverable from this file; a one-hot encoding is used
    because it does not have to pick which organism maps to 1 -- confirm
    before enabling this feature in a model.
    '''
    return pandas.get_dummies(data['Organism'], prefix='Organism', dtype=float)
200
+
201
+
202
def get_micro_homology_features(gene_names, learn_options, X):
    '''
    Compute a microhomology score and an out-of-frame score for every guide.

    gene_names    -- Series of target gene names, aligned by position with X
    learn_options -- unused here
    X             -- DataFrame with '30mer' and 'Strand' columns

    For each guide, the 30mer is located in the gene sequence (trying the
    other orientation when the first search fails), extended by 9 nt on the
    left and 21 nt on the right to a 60mer, and scored with
    microhomology.compute_score.  Guides that cannot be located get 0 for
    both scores.  Returns a float DataFrame indexed like X with columns
    'mh_score' and 'oof_score'.
    '''
    # originally was flipping the guide itself as necessary, but now flipping the gene instead

    # NOTE(review): `microhomology` was referenced without any import.  This
    # assumes the scorer lives at azimuth.features.microhomology -- confirm
    # that module is shipped with this package.
    from azimuth.features import microhomology

    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # np.where below yields *positions*, so read and write by position
    # (DataFrame.ix, used previously, no longer exists in pandas)
    guides = X['30mer'].values
    strands = X['Strand'].values

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21
    for gene in gene_names.unique():
        # the file imports `azimuth.util`; a bare `util` is never bound
        gene_seq = Seq.Seq(azimuth.util.get_gene_sequence(gene)).reverse_complement()
        guide_inds = np.where(gene_names.values == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene))
        for ps in guide_inds:
            guide_seq = Seq.Seq(guides[ps])
            strand = strands[ps]
            if strand == 'sense':
                # the flip persists across guides of the same gene, exactly
                # as in the original; the fallback below covers either state
                gene_seq = gene_seq.reverse_complement()
            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide not found in either orientation
                mh_score = 0
                oof_score = 0
            else:
                assert gene_seq[ind:(ind+len(guide_seq))] == guide_seq, "match not right"
                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[(ind + len(guide_seq)):(ind + len(guide_seq) + k_mer_length_right)]

                # Seq.tostring() has been removed from Biopython; len()/str()
                # give the same information
                assert len(left_win) == k_mer_length_left
                assert len(right_win) == k_mer_length_right

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                assert len(sixtymer) == 60, "should be of length 60"
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')
264
+
265
+
266
def local_gene_seq_features(gene_names, learn_options, X):
    '''
    Featurize the gene sequence immediately left and right of every guide.

    gene_names    -- Series of target gene names, aligned by position with X
    learn_options -- 'include_gene_guide_feature' gives the window length k;
                     'order' (and the keys read by
                     get_all_order_nuc_features) control the k-mer features
    X             -- DataFrame with '30mer' and 'Strand' columns

    Each guide (reverse-complemented when its strand is 'sense') must be
    found in the reverse complement of its gene; the k nucleotides on either
    side are then turned into k-mer features.  Returns a dict of feature
    sets with the prefixes 'gene_left_win' and 'gene_right_win'.
    Raises AssertionError when a guide is not found or a window is empty or
    of unequal length.
    '''
    print("building local gene sequence features")
    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = ""
    feat["gene_right_win"] = ""
    left_col = feat.columns.get_loc("gene_left_win")
    right_col = feat.columns.get_loc("gene_right_win")

    # np.where below yields *positions*, so read and write by position
    # (DataFrame.ix, used previously, no longer exists in pandas)
    guides = X['30mer'].values
    strands = X['Strand'].values

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']
    for gene in gene_names.unique():
        # the file imports `azimuth.util`; a bare `util` is never bound
        gene_seq = Seq.Seq(azimuth.util.get_gene_sequence(gene)).reverse_complement()
        for ps in np.where(gene_names.values == gene)[0]:
            guide_seq = Seq.Seq(guides[ps])
            strand = strands[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()
            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            assert ind != -1, "could not find guide in gene"
            assert gene_seq[ind:(ind+len(guide_seq))] == guide_seq, "match not right"
            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[(ind + len(guide_seq)):(ind + len(guide_seq) + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we just want
                # to keep them in the same relative alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()
            # the original checked left_win twice and never right_win
            assert str(left_win) != "", "k_mer_context, %s, is too large" % k_mer_length
            assert str(right_win) != "", "k_mer_context, %s, is too large" % k_mer_length
            assert len(left_win) == len(right_win), "k_mer_context, %s, is too large" % k_mer_length
            # Seq.tostring() has been removed from Biopython; str() is equivalent
            feat.iloc[ps, left_col] = str(left_win)
            feat.iloc[ps, right_col] = str(right_win)
        print("featurizing local context of %s" % (gene))

    feature_sets = {}
    get_all_order_nuc_features(feat["gene_left_win"], feature_sets, learn_options, learn_options["order"], max_index_to_use=sys.maxsize, prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"], feature_sets, learn_options, learn_options["order"], max_index_to_use=sys.maxsize, prefix="gene_right_win")
    return feature_sets
309
+
310
def gene_feature(Y, X, learn_options):
    '''
    Things like the sequence of the gene, the DNA Tm of the gene, etc.

    For every row of Y['Target gene'], look up the gene sequence and record
    its length, SeqUtil.GC value, nearest-neighbour melting temperature
    (Tm.Tm_NN with the DNA_NN3 table) and DNA molecular weight.

    X and learn_options are accepted for signature parity with the other
    feature builders and are not used.  Returns a DataFrame indexed like
    Y['Target gene'] with columns 'gene length', 'gene GC content',
    'gene temperature' and 'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    n_guides = gene_names.shape[0]

    gene_length = np.zeros((n_guides, 1))
    gc_content = np.zeros((n_guides, 1))
    temperature = np.zeros((n_guides, 1))
    molecular_weight = np.zeros((n_guides, 1))

    for gene in gene_names.unique():
        # the file imports `azimuth.util`; a bare `util` is never bound
        seq = azimuth.util.get_gene_sequence(gene)
        # rows belonging to this gene; computed once and reused
        rows = gene_names.values == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        temperature[rows] = Tm.Tm_NN(seq, nn_table=Tm.DNA_NN3)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    stacked = np.concatenate((gene_length, gc_content, temperature, molecular_weight), axis=1)
    df = pandas.DataFrame(data=stacked, index=gene_names.index, columns=['gene length',
                                                                         'gene GC content',
                                                                         'gene temperature',
                                                                         'gene molecular weight'])
    return df
335
+
336
def gene_guide_feature(Y, X, learn_options):
    '''
    Features related to the part of the gene local to the guide, possibly
    incorporating the guide or interactions with it.

    Computes the feature sets with local_gene_seq_features and pickles them
    to ../data/gene_seq_feat_V<V>_km<k>.ord<order>.pickle (relative to the
    current working directory) before returning them.
    '''
    import os

    # Built with os.path.join so the separator is right on every platform;
    # the previous hard-coded r"..\data\..." produced a file literally named
    # "..\data\..." in the working directory on non-Windows systems.
    gene_file = os.path.join(
        os.pardir, "data",
        "gene_seq_feat_V%s_km%s.ord%s.pickle" % (learn_options['V'], learn_options['include_gene_guide_feature'], learn_options['order']))

    # NOTE: re-loading a previously written pickle was disabled in the
    # original (`if False:`, "while debugging"), so the features are always
    # recomputed; that behaviour is kept.
    feature_sets = local_gene_seq_features(Y['Target gene'], learn_options, X)
    print("writing local gene seq feats to file %s" % gene_file)
    with open(gene_file, "wb") as f:
        pickle.dump(feature_sets, f)

    return feature_sets
352
+
353
+
354
def gc_cont(seq):
    '''
    Fraction (0.0-1.0) of `seq` made up of 'G' and 'C' characters.
    '''
    gc_total = sum(seq.count(base) for base in ('G', 'C'))
    return gc_total / float(len(seq))
356
+
357
+
358
+
359
def Tm_feature(data, pam_audit=True, learn_options=None):
    '''
    assuming '30mer' is a key

    Melting-temperature features (Tm.Tm_NN with the DNA_NN3 table) of:
        0-the whole 30mer ("global Tm")
        1..3-the three slices of the 30mer given by learn_options['Tm segments'],
             or by the defaults (19, 24), (11, 19), (6, 11) when that key
             (or learn_options itself) is missing

    With pam_audit on, raises if indices 25:27 of a sequence are not "GG".
    Returns a 4-column DataFrame indexed like `data`.
    '''
    has_custom_segments = learn_options is not None and 'Tm segments' in list(learn_options.keys())
    if has_custom_segments:
        segments = learn_options['Tm segments']
    else:
        segments = [(19, 24), (11, 19), (6, 11)]

    def melting_temp(fragment):
        return Tm.Tm_NN(fragment, nn_table=Tm.DNA_NN3)

    sequence = data['30mer'].values
    featarray = np.ones((sequence.shape[0], 4))

    for row, seq in enumerate(sequence):
        if pam_audit and seq[25:27] != "GG":
            raise Exception("expected GG but found %s" % seq[25:27])
        featarray[row, 0] = melting_temp(seq)  # 30mer Tm
        # columns 1..3: 5nts proximal of the PAM, 8-mer, 5-mer (with the defaults)
        for column in (1, 2, 3):
            bounds = segments[column - 1]
            featarray[row, column] = melting_temp(seq[bounds[0]:bounds[1]])

    # the flag only ends up as a suffix of the column names
    rna = False
    column_names = [template % rna for template in
                    ("Tm global_%s", "5mer_end_%s", "8mer_middle_%s", "5mer_start_%s")]
    return pandas.DataFrame(featarray, index=data.index, columns=column_names)
389
+
390
def gc_features(data, audit=True):
    '''
    GC features of the 30mers in data['30mer'].

    Returns three Series, in this order: the indicator "GC count > 10"
    (named 'GC > 10'), the indicator "GC count < 10" (named 'GC < 10'),
    and the raw count from countGC (named 'GC count').  `audit` is passed
    to countGC as its length check switch.
    '''
    def count_one(seq):
        return countGC(seq, audit)

    counts = data['30mer'].apply(count_one)
    counts.name = 'GC count'

    # multiplying by 1 turns the boolean masks into 0/1 integers
    above = (counts > 10) * 1
    above.name = 'GC > 10'
    below = (counts < 10) * 1
    below.name = 'GC < 10'
    return above, below, counts
398
+
399
+
400
+
401
def normalize_features(data,axis):
    '''
    input: pandas.DataFrame of dtype=np.float64
    Mean-center and scale to unit variance along `axis`, then drop every
    column that contains NaN (e.g. a zero-variance column, where 0/0 = NaN).

    Returns a new DataFrame; the input is left untouched.  (The original
    used `-=` / `/=`, which silently rewrote the caller's DataFrame.)
    '''
    data = data - data.mean(axis)
    data = data / data.std(axis)
    # remove columns with NaNs (axis must be passed by keyword on current pandas)
    data = data.dropna(axis=1)
    if np.any(np.isnan(data.values)): raise Exception("found NaN in normalized features")
    return data
412
+
413
def apply_nucleotide_features(seq_data_frame, order, num_proc, include_pos_independent, max_index_to_use, prefix=""):
    '''
    Apply nucleotide_features to every sequence of `seq_data_frame`.

    Returns the position-dependent features, or the pair
    (position-dependent, position-independent) when
    include_pos_independent is truthy.  Asserts that no NaN was produced.
    `num_proc` is accepted but not used.
    '''
    def featurize(kind):
        return seq_data_frame.apply(nucleotide_features, args=(order, max_index_to_use, prefix, kind))

    position_dependent = featurize('pos_dependent')
    if not include_pos_independent:
        assert not np.any(np.isnan(position_dependent)), "found nan in feat_pd"
        return position_dependent

    position_independent = featurize('pos_independent')
    assert not np.any(np.isnan(position_dependent)), "nans here can arise from sequences of different lengths"
    assert not np.any(np.isnan(position_independent)), "nans here can arise from sequences of different lengths"
    return position_dependent, position_independent
426
+
427
def get_alphabet(order, raw_alphabet = ['A', 'T', 'C', 'G']):
    '''
    All strings of length `order` over `raw_alphabet`, in
    itertools.product order (last position varies fastest).
    '''
    return list(map("".join, itertools.product(raw_alphabet, repeat=order)))
430
+
431
def nucleotide_features(s, order, max_index_to_use, prefix="", feature_type='all', raw_alphabet = ['A', 'T', 'C', 'G']):
    '''
    compute position-specific order-mer features for the 4-letter alphabet
    (e.g. for a sequence of length 30, there are 30*4 single nucleotide features
    and (30-1)*4^2=464 double nucleotide features

    s                -- the sequence
    order            -- k-mer length
    max_index_to_use -- only s[:max_index_to_use] is featurized; None means
                        the whole string
    prefix           -- prepended to every feature label
    feature_type     -- 'pos_dependent', 'pos_independent' or 'all'
    raw_alphabet     -- single letters the k-mers are built from

    Returns a pandas.Series of one-hot position-dependent features (labels
    '<prefix><kmer>_<position>'), a Series of k-mer counts (labels
    '<prefix><kmer>'), or the tuple (dependent, independent) for 'all'.
    Raises ValueError when a k-mer of `s` is not in the alphabet.

    Fixes over the previous version: the length clamp was inverted, so a
    string longer than max_index_to_use was never trimmed; None raised a
    TypeError before the None check was reached; and the 'all' branch
    called `.values` on a tuple.
    '''
    assert feature_type in ['all', 'pos_independent', 'pos_dependent']

    if max_index_to_use is not None:
        # slicing past the end is harmless, so no clamping is needed
        s = s[:max_index_to_use]

    alphabet = ["".join(letters) for letters in itertools.product(raw_alphabet, repeat=order)]
    alphabet_size = len(alphabet)
    # k-mer -> position in `alphabet`; setdefault keeps the first occurrence,
    # matching what alphabet.index() returned
    kmer_rank = {}
    for rank, kmer in enumerate(alphabet):
        kmer_rank.setdefault(kmer, rank)

    # a string shorter than `order` simply has no k-mers
    n_positions = max(len(s) - order + 1, 0)

    features_pos_dependent = np.zeros(alphabet_size * n_positions)
    features_pos_independent = np.zeros(alphabet_size)

    index_dependent = ['%s%s_%d' % (prefix, kmer, position)
                       for position in range(n_positions) for kmer in alphabet]
    index_independent = ['%s%s' % (prefix, kmer) for kmer in alphabet]

    for position in range(n_positions):
        nucl = str(s[position:position+order])
        try:
            rank = kmer_rank[nucl]
        except KeyError:
            # keep the exception type that list.index() used to raise
            raise ValueError("%r is not in list" % nucl)
        slot = rank + position * alphabet_size
        features_pos_dependent[slot] = 1.0
        features_pos_independent[rank] += 1.0

        # this is to check that the labels in the pd df actually match the nucl and position
        assert index_dependent[slot] == '%s%s_%d' % (prefix, nucl, position)
        assert index_independent[rank] == '%s%s' % (prefix, nucl)

    if feature_type == 'pos_independent':
        return pandas.Series(features_pos_independent, index=index_independent)

    dependent = pandas.Series(features_pos_dependent, index=index_dependent)
    if feature_type == 'all':
        return dependent, pandas.Series(features_pos_independent, index=index_independent)
    return dependent
492
+
493
def nucleotide_features_dictionary(prefix=''):
    '''
    Map generic feature ids to readable names for a 30-long sequence and
    k-mer orders 1 to 3.

    Keys are '<prefix>_pd.Order<k>_P<i>' (position-dependent) and
    '<prefix>_pi.Order<k>_P<i>' (position-independent); values are
    '<kmer>_<position label>' and '<kmer>' respectively, where the position
    labels run -4..-1, 1..20, N, G, G, +1..+3.
    '''
    position_labels = (['-4', '-3', '-2', '-1']
                       + [str(i) for i in range(1, 21)]
                       + ['N', 'G', 'G', '+1', '+2', '+3'])

    sequence_length = 30
    raw_alphabet = ['A', 'T', 'C', 'G']

    index_dependent = []
    index_independent = []
    feature_names_dep = []
    feature_names_indep = []

    for order in (1, 2, 3):
        alphabet = ["".join(letters) for letters in itertools.product(raw_alphabet, repeat=order)]
        n_windows = sequence_length - (order - 1)
        n_dependent = len(alphabet) * n_windows
        n_independent = len(raw_alphabet) ** order

        index_dependent += ['%s_pd.Order%d_P%d' % (prefix, order, i) for i in range(n_dependent)]
        index_independent += ['%s_pi.Order%d_P%d' % (prefix, order, i) for i in range(n_independent)]

        feature_names_dep += ['%s_%s' % (letter, position_labels[pos])
                              for pos in range(n_windows) for letter in alphabet]
        feature_names_indep += ['%s' % letter for letter in alphabet]

    assert len(feature_names_indep) == len(index_independent)
    assert len(feature_names_dep) == len(index_dependent)

    ids = index_dependent + index_independent
    names = feature_names_dep + feature_names_indep
    return dict(zip(ids, names))
528
+
529
def normalize_feature_sets(feature_sets):
    '''
    Run normalize_features (axis=0) on every feature set and return the
    results in a new dict with the same keys.

    Raises if a normalized set contains NaN; asserts that every normalized
    set still has at least one column.  Prints the elapsed time.
    '''
    print("Normalizing features...")
    started = time.time()

    normalized = {}
    for name, features in feature_sets.items():
        scaled = normalize_features(features, axis=0)
        normalized[name] = scaled
        if np.any(np.isnan(scaled.values)):
            raise Exception("found Nan feature values in set=%s" % name)
        assert scaled.shape[1] > 0, "0 columns of features"
    finished = time.time()
    print("\t\tElapsed time for normalizing features is %.2f seconds" % (finished-started))

    return normalized
src/utils/azimuth/load_data.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas
2
+ import pkg_resources
3
+
4
+ from . import util
5
+ # import matplotlib.pyplot as plt
6
+ import scipy as sp
7
+ import scipy.stats
8
+ import numpy as np
9
+ import os
10
+
11
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
12
+
13
def from_custom_file(data_file, learn_options):
    '''
    Load prediction inputs from the CSV file `data_file`, using the same
    conventions as the V2 data.

    The file must contain the columns '30mer', 'Target gene',
    'Percent Peptide' and 'Amino Acid Cut position' (asserted).

    Returns (Xdf, Y, gene_position, target_genes): Xdf is indexed by
    (Sequence, Target, drug) with a placeholder drug name per row, Y is
    always None, gene_position holds the two gene-position columns and
    target_genes the unique values of the Target index level.
    learn_options is passed through set_V2_target_names.
    '''
    print("Loading inputs to predict from %s" % data_file)
    data = pandas.read_csv(data_file)

    mandatory_columns = ['30mer', 'Target gene', 'Percent Peptide', 'Amino Acid Cut position']
    missing_message = "inputs for prediction must include these columns: %s" % mandatory_columns
    assert all(col in data.columns for col in mandatory_columns), missing_message

    frame = pandas.DataFrame(data)
    # set_index consumes '30mer', so stash a copy first and restore it after
    frame['30mercopy'] = frame['30mer']
    frame = frame.set_index(['30mer', 'Target gene'])
    frame['30mer'] = frame['30mercopy']
    frame.index.names = ['Sequence', 'Target']

    # one placeholder drug per row, appended as a third index level
    n_rows = frame.shape[0]
    frame['drug'] = ['dummydrug%s' % row for row in range(n_rows)]
    frame = frame.set_index('drug', append=True)

    gene_position = frame[['Percent Peptide', 'Amino Acid Cut position']]
    target_genes = np.unique(frame.index.levels[1])

    learn_options = set_V2_target_names(learn_options)

    return frame, None, gene_position, target_genes
37
+
38
+
39
def from_file(data_file, learn_options, data_file2=None, data_file3=None):
    """Load the training data set selected by learn_options["V"].

    Supported versions:
        1 -- V1 data (Nature Biotech paper), read from data_file
        2 -- V2 data (Nov 2014), read from data_file
        3 -- merge of V1 (data_file) and V2 (data_file2)
        4 -- merge of V1, V2 and the Xu et al. data (data_file3)
        5 -- Xu et al. data only (data_file3)

    learn_options is updated in place with the target-name keys
    ('binary target name', 'rank-transformed target name',
    'raw target name') that match the loaded data.

    Returns (Xdf, Y, gene_position, target_genes), with Xdf["30mer"]
    truncated to 30 characters.

    Raises ValueError when learn_options["V"] is not one of the supported
    versions (previously this surfaced as an UnboundLocalError).
    """
    version = learn_options["V"]

    if version == 1:  # from Nature Biotech paper

        print("loading V%d data" % version)

        assert learn_options["weighted"] is None, "not supported for V1 data"
        annotations, gene_position, target_genes, Xdf, Y = read_V1_data(data_file, learn_options)

        learn_options['binary target name'] = 'average threshold'
        learn_options['rank-transformed target name'] = 'average rank'
        learn_options['raw target name'] = 'average activity'

    elif version == 2:  # from Nov 2014, hot off the machines
        Xdf, drugs_to_genes, target_genes, Y, gene_position = read_V2_data(data_file, learn_options)

        # check that data is consistent with sgRNA score
        xx = Xdf['sgRNA Score'].values
        yy = Y['score_drug_gene_rank'].values
        rr, pp = sp.stats.pearsonr(xx, yy)
        assert rr > 0, "data processing has gone wrong as correlation with previous predictions is negative"

        learn_options = set_V2_target_names(learn_options)

    elif version == 3:  # merge of V1 and V2--this is what is used for the final model
        # these are relative to the V2 data, and V1 will be made to automatically match
        learn_options['binary target name'] = 'score_drug_gene_threshold'
        learn_options['rank-transformed target name'] = 'score_drug_gene_rank'
        learn_options['raw target name'] = None

        Xdf, Y, gene_position, target_genes = mergeV1_V2(data_file, data_file2, learn_options)

    elif version == 4:  # merge of V1 and V2 and the Xu et al data
        # these are relative to the V2 data, and V1 and Xu et al. will be made to automatically match
        learn_options['binary target name'] = 'score_drug_gene_threshold'
        learn_options['rank-transformed target name'] = 'score_drug_gene_rank'
        learn_options['raw target name'] = None

        Xdf, Y, gene_position, target_genes = merge_all(data_file, data_file2, data_file3, learn_options)

    elif version == 5:  # Xu et al. data only
        learn_options['binary target name'] = 'score_drug_gene_threshold'
        learn_options['rank-transformed target name'] = 'score_drug_gene_rank'
        learn_options['raw target name'] = None

        gene_position, target_genes, Xdf, Y = read_xu_et_al(data_file3)

    else:
        raise ValueError("unsupported data version learn_options['V']=%r (expected 1-5)" % (version,))

    # truncate down to 30--some data sets gave us more.
    Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[0:30])

    return Xdf, Y, gene_position, target_genes
95
+
96
+
97
def set_V2_target_names(learn_options):
    """Fill in the V2 target-name keys on learn_options and return it.

    'binary target name' and 'rank-transformed target name' are only set
    when absent; 'raw target name' is always overwritten with 'score'.
    The mapping is modified in place.
    """
    learn_options.setdefault('binary target name', 'score_drug_gene_threshold')
    learn_options.setdefault('rank-transformed target name', 'score_drug_gene_rank')
    learn_options['raw target name'] = 'score'
    return learn_options
104
+
105
+
106
def combine_organisms(human_data, mouse_data):
    """Stack the human (CD13/CD15/CD33) and mouse V1 data into one X and one Y frame.

    Both inputs are expected to carry a 'Target' index level.  Human rows
    are pulled per target with a fixed list of Y columns; mouse rows are
    pulled per value of the second index level, with "On-target Gene" as
    the Y column.

    Returns (X, Y), concatenated in the order CD13, CD15, CD33, mouse.
    """
    def _human_target(target, y_names):
        # cross-section on the 'Target' level, keeping that level in the index
        rows = human_data.xs(target, level='Target', drop_level=False)
        return util.get_data(rows, y_names=y_names)

    X_CD13, Y_CD13 = _human_target('CD13', ['NB4 CD13', 'TF1 CD13'])
    X_CD33, Y_CD33 = _human_target('CD33', ['MOLM13 CD33', 'TF1 CD33', 'NB4 CD33'])
    X_CD15, Y_CD15 = _human_target('CD15', ['MOLM13 CD15'])

    mouse_X = pandas.DataFrame()
    mouse_Y = pandas.DataFrame()
    for gene in mouse_data.index.levels[1]:
        gene_rows = mouse_data.xs(gene, level='Target', drop_level=False)
        gene_X, gene_Y = util.get_data(gene_rows, ["On-target Gene"], target_gene=gene, organism='mouse')
        mouse_X = pandas.concat([mouse_X, gene_X], axis=0)
        mouse_Y = pandas.concat([mouse_Y, gene_Y], axis=0)

    combined_X = pandas.concat([X_CD13, X_CD15, X_CD33, mouse_X], axis=0)
    combined_Y = pandas.concat([Y_CD13, Y_CD15, Y_CD33, mouse_Y], axis=0)

    return combined_X, combined_Y
129
+
130
+
131
def read_V1_data(data_file, learn_options, AML_file=None):
    """Read the V1 data set and its gene-position annotations.

    data_file -- Excel workbook with human data on sheet 0 and mouse data
                 on sheet 1; defaults to the packaged "data/V1_data.xlsx".
    learn_options -- may be None; when it has a truthy "flipV1target"
                 entry the 'average threshold' labels are flipped and an
                 interactive debugger is started (debug mode only).
    AML_file  -- tab-separated annotation file; defaults to the packaged
                 "data/V1_suppl_data.txt".

    Returns (annotations, gene_position, target_genes, Xdf, Y), where Xdf
    and Y are restricted to (and ordered like) the rows that have
    annotations.
    """
    if data_file is None:
        data_file = pkg_resources.resource_filename(__name__, "data/V1_data.xlsx")
    if AML_file is None:
        AML_file = pkg_resources.resource_filename(__name__, "data/V1_suppl_data.txt")

    human_data = pandas.read_excel(data_file, sheet_name=0, index_col=[0, 1])
    mouse_data = pandas.read_excel(data_file, sheet_name=1, index_col=[0, 1])
    Xdf, Y = combine_organisms(human_data, mouse_data)

    # get position within each gene, then join and re-order
    # note that 11 missing guides we were told to ignore
    annotations = pandas.read_csv(AML_file, delimiter='\t', index_col=[0, 4])
    annotations.index.names = Xdf.index.names
    gene_position = pandas.merge(Xdf, annotations, how="inner", left_index=True, right_index=True)
    gene_position = util.impute_gene_position(gene_position)
    gene_position = gene_position[['Amino Acid Cut position', 'Nucleotide cut position', 'Percent Peptide']]
    Y = Y.loc[gene_position.index]
    Xdf = Xdf.loc[gene_position.index]

    Y['test'] = 1  # for bookkeeping to keep consistent with V2 which uses this for "extra pairs"

    target_genes = Y['Target gene'].unique()

    Y.index.names = ['Sequence', 'Target gene']

    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    # .get: callers are not required to supply the debug flag
    if learn_options is not None and learn_options.get("flipV1target"):
        print("************************************************************************")
        print("*****************MATCHING DOENCH CODE (DEBUG MODE)**********************")
        print("************************************************************************")
        # normally it is: Y['average threshold'] = Y['average rank'] > 0.8, where 1s are good guides, 0s are not
        Y['average threshold'] = Y['average rank'] < 0.2  # 1s are bad guides
        print("press c to continue")
        # ipdb is an optional third-party package; fall back to the stdlib debugger
        try:
            import ipdb as debugger
        except ImportError:
            import pdb as debugger
        debugger.set_trace()

    return annotations, gene_position, target_genes, Xdf, Y
170
+
171
def rank_transform(x):
    """Return 1 - rank/max(rank) for the values in x (largest value maps to 0)."""
    ranks = sp.stats.mstats.rankdata(x)
    return 1.0 - ranks / ranks.max()
173
+
174
def read_xu_et_al(data_file, learn_options=None, verbose=True, subsetting='ours'):
    """Read the Xu et al. workbook and reshape it to the V2-style layout.

    data_file  -- Excel workbook with '<name>_efficient_sgRNA' and
                  '<name>_inefficient_sgRNA' sheets for each of 'ribo',
                  'non_ribo' and 'mESC'; defaults to
                  '../data/xu_et_al_data.xlsx' (relative to the working
                  directory).
    subsetting -- 'ours' keeps characters [6:-4] of each sequence, any
                  other value keeps [10:].
    learn_options and verbose are accepted but not used here.

    Returns (gene_position, target_genes, Xdf, Y), all indexed by
    (Sequence, Target, drug) with drug == 'nodrug'.
    """
    if data_file is None:
        data_file = '../data/xu_et_al_data.xlsx'

    seq_col = "sequence(target+3'+5')"
    per_dataset = []

    for d in ['ribo', 'non_ribo', 'mESC']:
        data_efficient = pandas.read_excel(data_file, sheet_name='%s_efficient_sgRNA' % d, skiprows=2)
        data_inefficient = pandas.read_excel(data_file, sheet_name='%s_inefficient_sgRNA' % d, skiprows=2)

        data_efficient['threshold'] = 1.
        data_inefficient['threshold'] = 0.

        exp_data = pandas.concat((data_efficient, data_inefficient))
        # ranks are computed per gene, separately for each cell line
        exp_data['rank_KBM7'] = exp_data.groupby('Gene Symbol')['log2 fold change, KBM7'].transform(rank_transform)
        exp_data['rank_HL60'] = exp_data.groupby('Gene Symbol')['log2 fold change, HL60'].transform(rank_transform)

        per_dataset.append(exp_data)

    aggregated = pandas.concat(per_dataset)

    # go from 40mer to 30mer
    if subsetting == 'ours':
        aggregated[seq_col] = aggregated[seq_col].apply(lambda x: x[6:-4])
    else:
        aggregated[seq_col] = aggregated[seq_col].apply(lambda x: x[10:])

    # make sure EVERYTHING is uppercase
    aggregated[seq_col] = aggregated[seq_col].apply(lambda x: x.upper())

    # rename columns
    aggregated.rename(columns={seq_col: '30mer', 'Gene Symbol': 'Target gene', 'strand': 'Strand'}, inplace=True)

    # Single .loc[mask, column] assignment: the chained form
    # aggregated['Strand'].loc[mask] = ... may write to a temporary copy.
    aggregated.loc[aggregated['Strand'] == '+', 'Strand'] = 'sense'
    aggregated.loc[aggregated['Strand'] == '-', 'Strand'] = 'antisense'

    aggregated['average rank'] = aggregated[['rank_HL60', 'rank_KBM7']].mean(axis=1)
    df = aggregated.rename(columns={'30mer': 'Sequence', 'Target gene': 'Target'})
    df['drug'] = 'nodrug'
    df['test'] = 1
    df = df.set_index(['Sequence', 'Target', 'drug'])
    # restore the sequence and gene as ordinary columns next to the index
    df['30mer'] = df.index.get_level_values(0)
    df['Target gene'] = df.index.get_level_values(1)
    df['Organism'] = 'unknown'
    df['score_drug_gene_rank'] = df['average rank']
    df['score_drug_gene_threshold'] = df['threshold']
    df['Nucleotide cut position'] = df['start of target']
    df['Percent Peptide'] = 0
    df['Amino Acid Cut position'] = 0
    target_genes = np.unique(df['Target gene'].values)

    gene_position = df[['Nucleotide cut position', 'Percent Peptide', 'Amino Acid Cut position']]
    Xdf = df[['30mer', 'Strand']]
    Y = df[['score_drug_gene_rank', 'score_drug_gene_threshold', 'test', 'Target gene']]
    return gene_position, target_genes, Xdf, Y
230
+
231
def read_V2_data(data_file, learn_options=None, verbose=True):
    """Read the V2 workbook and build per-(guide, gene, drug) features and targets.

    data_file     -- Excel workbook with a "ResultsFiltered" sheet (and a
                     "Normalized" sheet when variance weighting is on);
                     defaults to the packaged "data/V2_data.xlsx".
    learn_options -- may be None.  When given, 'extra pairs' / 'all pairs'
                     widen the set of (drug, gene) pairs that are loaded,
                     and weighted == "variance" adds a replicate-variance
                     column to Y.
    verbose       -- print a progress line per loaded gene.

    Returns (Xdf, drugs_to_genes, target_genes, Y, gene_position).  Xdf and
    Y share the same index, which has 'drug' appended as the last level.
    """
    if data_file is None:
        data_file = pkg_resources.resource_filename(__name__, "data/V2_data.xlsx")

    data = pandas.read_excel(data_file, sheet_name="ResultsFiltered", skiprows=list(range(0, 6+1)), index_col=[0, 4])
    # grab data relevant to each of three drugs, which excludes some genes
    # note gene MED12 has two drugs, all others have at most one
    Xdf = pandas.DataFrame()

    # This comes from the "Pairs" tab in their excel sheet,
    # note HPRT/HPRT1 are same thing, and also PLX_2uM/PLcX_2uM
    known_pairs = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                   '6TG_2ug/mL': ['HPRT1'],
                   'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    # starts out equal to known_pairs but is a separate dict because the
    # lists are extended below
    drugs_to_genes = {'AZD_200nM': ['CCDC101', 'MED12', 'TADA2B', 'TADA1'],
                      '6TG_2ug/mL': ['HPRT1'],
                      'PLX_2uM': ['CUL3', 'NF1', 'NF2', 'MED12']}

    if learn_options is not None:
        assert not (learn_options['extra pairs'] and learn_options['all pairs']), "extra pairs and all pairs options (in learn_options) can't be active simultaneously."

        if learn_options['extra pairs']:
            drugs_to_genes['AZD_200nM'].extend(['CUL3', 'NF1', 'NF2'])
        elif learn_options['all pairs']:
            drugs_to_genes['AZD_200nM'].extend(['HPRT1', 'CUL3', 'NF1', 'NF2'])
            drugs_to_genes['PLX_2uM'].extend(['HPRT1', 'CCDC101', 'TADA2B', 'TADA1'])
            drugs_to_genes['6TG_2ug/mL'].extend(['CCDC101', 'MED12', 'TADA2B', 'TADA1', 'CUL3', 'NF1', 'NF2'])

    count = 0
    for drug in list(drugs_to_genes.keys()):
        genes = drugs_to_genes[drug]
        for g in genes:
            # copy only this gene's rows (not the whole sheet) before adding columns
            Xtmp = data.xs(g, level='Target gene', drop_level=False).copy()
            Xtmp['drug'] = drug
            Xtmp['score'] = Xtmp[drug].copy()  # grab the drug results that are relevant for this gene

            # 'test' marks the pairs listed in known_pairs
            if g in known_pairs[drug]:
                Xtmp['test'] = 1.
            else:
                Xtmp['test'] = 0.

            count = count + Xtmp.shape[0]
            Xdf = pandas.concat([Xdf, Xtmp], axis=0)
            if verbose:
                print("Loaded %d samples for gene %s \ttotal number of samples: %d" % (Xtmp.shape[0], g, count))

    # create new index that includes the drug
    Xdf = Xdf.set_index('drug', append=True)

    Y = pandas.DataFrame(Xdf.pop("score"))
    Y.columns.names = ["score"]

    test_gene = pandas.DataFrame(Xdf.pop('test'))
    target = pandas.DataFrame(Xdf.index.get_level_values('Target gene').values, index=Y.index, columns=["Target gene"])
    Y = pandas.concat((Y, target, test_gene), axis=1)
    target_genes = Y['Target gene'].unique()
    gene_position = Xdf[["Percent Peptide", "Amino Acid Cut position"]].copy()

    # convert to ranks for each (gene, drug combo)
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug in list(drugs_to_genes.keys()):
        gene_list = drugs_to_genes[drug]
        for gene in gene_list:
            ytmp = pandas.DataFrame(Y.xs((gene, drug), level=["Target gene", "drug"], drop_level=False)['score'])
            y_ranktmp, y_rank_raw, y_thresholdtmp, y_quanttmp = util.get_ranks(ytmp, thresh=0.8, prefix="score_drug_gene", flip=False)
            y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
            y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
            y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    # convert also by drug only, irrespective of gene
    y_rank = pandas.DataFrame()
    y_threshold = pandas.DataFrame()
    y_quant = pandas.DataFrame()
    for drug in list(drugs_to_genes.keys()):
        ytmp = pandas.DataFrame(Y.xs(drug, level="drug", drop_level=False)['score'])
        y_ranktmp, y_rank_raw, y_thresholdtmp, y_quanttmp = util.get_ranks(ytmp, thresh=0.8, prefix="score_drug", flip=False)
        y_rank = pandas.concat((y_rank, y_ranktmp), axis=0)
        y_threshold = pandas.concat((y_threshold, y_thresholdtmp), axis=0)
        y_quant = pandas.concat((y_quant, y_quanttmp), axis=0)

    yall = pandas.concat((y_rank, y_threshold, y_quant), axis=1)
    Y = pandas.merge(Y, yall, how='inner', left_index=True, right_index=True)

    gene_position = util.impute_gene_position(gene_position)

    if learn_options is not None and learn_options["weighted"] == "variance":
        print("computing weights from replicate variance...")
        # compute the variance across replicates so can use it as a weight
        data = pandas.read_excel(data_file, sheet_name="Normalized", skiprows=list(range(0, 6+1)), index_col=[0, 4])
        data.index.names = ["Sequence", "Target gene"]

        # replicate columns per drug; names (including the trailing space
        # in 'Deep 29 ') are reproduced exactly as written here
        experiments = {}
        experiments['AZD_200nM'] = ['Deep 25', 'Deep 27', 'Deep 29 ', 'Deep 31']
        experiments['6TG_2ug/mL'] = ['Deep 33', 'Deep 35', 'Deep 37', 'Deep 39']
        experiments['PLX_2uM'] = ['Deep 49', 'Deep 51', 'Deep 53', 'Deep 55']

        variance = None
        for drug in list(drugs_to_genes.keys()):
            in_drug = data.index.get_level_values('Target gene').isin(drugs_to_genes[drug])
            # explicit copy so the column assignments below do not target a slice of `data`
            data_tmp = data.iloc[in_drug][experiments[drug]].copy()
            data_tmp["drug"] = drug
            data_tmp = data_tmp.set_index('drug', append=True)
            data_tmp["variance"] = np.var(data_tmp.values, axis=1)
            if variance is None:
                variance = data_tmp["variance"].copy()
            else:
                variance = pandas.concat((variance, data_tmp["variance"]), axis=0)

        orig_index = Y.index.copy()
        Y = pandas.merge(Y, pandas.DataFrame(variance), how="inner", left_index=True, right_index=True)
        # restore the pre-merge row order (.ix was removed from pandas; .loc is the label-based equivalent)
        Y = Y.loc[orig_index]
        print("done.")

    # Make sure to keep this check last in this function
    assert Xdf.index.equals(Y.index), "The index of Xdf is different from the index of Y (this can cause inconsistencies/random performance later on)"

    return Xdf, drugs_to_genes, target_genes, Y, gene_position
373
+
374
+
375
def merge_all(data_file=None, data_file2=None, data_file3=None, learn_options=None):
    """Merge the V1/V2 data (data_file, data_file2) with the Xu et al. data (data_file3).

    Returns (Xdf, Y, gene_position, target_genes), each with the Xu et al.
    rows appended after the merged V1/V2 rows.
    """
    merged_X, merged_Y, merged_pos, merged_genes = mergeV1_V2(data_file, data_file2, learn_options)
    xu_pos, xu_genes, xu_X, xu_Y = read_xu_et_al(data_file3, learn_options)

    return (
        pandas.concat((merged_X, xu_X)),
        pandas.concat((merged_Y, xu_Y)),
        pandas.concat((merged_pos, xu_pos)),
        np.concatenate((merged_genes, xu_genes)),
    )
384
+
385
def mergeV1_V2(data_file, data_file2, learn_options):
    '''
    Read the V1 data (data_file) and the V2 data (data_file2) and stack them.

    The target column names follow the V2 data: the V1 'average rank' and
    'average threshold' columns are renamed to the names stored in
    learn_options, and V1 rows get a dummy "nodrug" index level so both
    sets share the same index layout.

    Returns (Xdf, Y, gene_position, target_genes).
    '''
    assert not learn_options['include_strand'], "don't currently have 'Strand' column in V1 data"

    def _append_nodrug_level(frame):
        # add a constant "drug" column and move it into the index as the last level
        frame["drug"] = ["nodrug"] * frame.shape[0]
        return frame.set_index('drug', append=True)

    v1_index_names = ['Sequence', 'Target gene', 'drug']

    annotations, gene_position1, target_genes1, Xdf1, Y1 = read_V1_data(data_file, learn_options)
    Xdf2, drugs_to_genes, target_genes2, Y2, gene_position2 = read_V2_data(data_file2)

    # --- targets ---
    Y1.rename(columns={'average rank': learn_options["rank-transformed target name"]}, inplace=True)
    Y1.rename(columns={'average threshold': learn_options["binary target name"]}, inplace=True)
    Y1 = _append_nodrug_level(Y1)
    Y1.index.names = list(v1_index_names)

    Y_cols_to_keep = np.unique(['Target gene', 'test', 'score_drug_gene_rank', 'score_drug_gene_threshold'])
    Y1 = Y1[Y_cols_to_keep]
    Y2 = Y2[Y_cols_to_keep]

    # --- features ---
    Xdf1 = _append_nodrug_level(Xdf1)
    X_cols_to_keep = ['30mer', 'Strand']
    Xdf1 = Xdf1[X_cols_to_keep]
    Xdf2 = Xdf2[X_cols_to_keep]

    # --- gene positions ---
    gene_position1 = _append_nodrug_level(gene_position1)
    gene_position1.index.names = list(v1_index_names)
    cols_to_keep = ['Percent Peptide', 'Amino Acid Cut position']
    gene_position1 = gene_position1[cols_to_keep]
    gene_position2 = gene_position2[cols_to_keep]

    # V1 rows first, then V2 rows
    Y = pandas.concat((Y1, Y2), axis=0)
    Xdf = pandas.concat((Xdf1, Xdf2), axis=0)
    gene_position = pandas.concat((gene_position1, gene_position2))
    target_genes = np.concatenate((target_genes1, target_genes2))

    # developer switch: the export below is disabled and writes to a hard-coded path
    save_to_file = False

    if save_to_file:
        Y.index.names = ['Sequence', 'Target', 'drug']
        assert np.all(Xdf.index.values==Y.index.values), "rows don't match up"

        first_dup = np.where(Y.index.duplicated())[0][0]
        same_seq = np.where(Y.index.get_level_values(0).values==Y.index[first_dup][0])[0]

        # arbitrarily set one of these to have "nodrug2" as the third level index
        # so that they are not repeated, and the joins therefore do not augment the data set
        assert len(same_seq)==2, "expected only duplicates"
        newindex = Y.index.tolist()
        newindex[first_dup] = (newindex[first_dup][0], newindex[first_dup][1], "nodrug2")
        Y.index = pandas.MultiIndex.from_tuples(newindex, names = Y.index.names)
        Xdf.index = pandas.MultiIndex.from_tuples(newindex, names = Y.index.names)

        # there seems to be a duplicate index, and thus this increases the data set size, so doing it the hacky way...
        XandY = pandas.merge(Xdf, Y, how="inner", left_index=True, right_index=True)
        gene_position_tmp = gene_position.copy()
        gene_position_tmp.index.names = ['Sequence', 'Target', 'drug']
        gene_position_tmp.index = pandas.MultiIndex.from_tuples(newindex, names = Y.index.names)
        XandY = pandas.merge(XandY, gene_position_tmp, how="inner", left_index=True, right_index=True)

        # truncate to 30mers
        XandY["30mer"] = XandY["30mer"].apply(lambda x: x[0:30])
        XandY.to_csv(r'D:\Source\CRISPR\data\tmp\V3.csv')

    return Xdf, Y, gene_position, target_genes
458
+
459
+
460
def get_V1_genes(data_file=None):
    """Return the target genes of the V1 data set (third item of read_V1_data)."""
    _, _, v1_genes, _, _ = read_V1_data(data_file, learn_options=None)
    return v1_genes
463
+
464
+
465
def get_V2_genes(data_file=None):
    """Return the target genes of the V2 data set, loading it without progress output."""
    _, _, v2_genes, _, _ = read_V2_data(data_file, verbose=False)
    return v2_genes
468
+
469
+
470
def get_V3_genes(data_fileV1=None, data_fileV2=None):
    """Return the V1 target genes followed by the V2 target genes as one array."""
    v1_genes = get_V1_genes(data_fileV1)
    v2_genes = get_V2_genes(data_fileV2)
    return np.concatenate((v1_genes, v2_genes))
473
+
474
def get_xu_genes(data_file=None):
    """Return the target genes of the Xu et al. data set (second item of read_xu_et_al)."""
    loaded = read_xu_et_al(data_file)
    return loaded[1]
476
+
477
def get_mouse_genes(data_file=None):
    """Return the unique 'Target gene' values of V1 rows whose 'Organism' is 'mouse'."""
    _, _, _, v1_features, _ = read_V1_data(data_file, learn_options=None)
    is_mouse = v1_features['Organism'] == 'mouse'
    return v1_features[is_mouse]['Target gene'].unique()
480
+
481
+
482
def get_human_genes(data_file=None):
    """Return the V3 (V1 + V2) genes that are not mouse genes in the V1 data.

    Note that the V3 gene list is always built from the default data
    files; only the mouse lookup honours data_file.
    """
    _, _, _, v1_features, _ = read_V1_data(data_file, learn_options=None)
    is_mouse = v1_features['Organism'] == 'mouse'
    mouse_genes = v1_features[is_mouse]['Target gene'].unique()
    all_genes = get_V3_genes(None, None)  # TODO this needs to support specifying file names (!= 'None')
    return np.setdiff1d(all_genes, mouse_genes)
src/utils/azimuth/model_comparison.py ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from importlib import resources
2
+
3
+ import azimuth.predict as pd
4
+ import copy
5
+ import os
6
+ import numpy as np
7
+ import azimuth.util
8
+ import shutil
9
+ import pickle
10
+ # import pylab as plt
11
+ import pandas
12
+ # import azimuth.local_multiprocessing
13
+ import azimuth.load_data
14
+ import azimuth.features.featurization as feat
15
+ import traceback
16
+
17
+
18
def check_feature_set_dims(feature_sets):
    """Assert that every feature set has the same number of rows.

    feature_sets maps a feature-set name to an object with a .shape
    attribute; shape[0] is taken as the number of individuals.

    Raises AssertionError when two sets disagree on shape[0], or when
    feature_sets is empty.
    """
    expected_rows = None
    for name, features in feature_sets.items():
        n_rows = features.shape[0]
        # the first set seen fixes the reference row count
        # (the original assigned the other way round, so nothing was ever compared)
        if expected_rows is None:
            expected_rows = n_rows
        assert n_rows == expected_rows, "not same # individuals for feature %s" % name

    assert feature_sets != {}, "features are empty, check learn_options"
26
+
27
+
28
def set_target(learn_options, classification):
    """Set the target, training metric and ground-truth label on learn_options.

    Regression (classification falsy) uses the 'rank-transformed target
    name' with metric 'spearmanr'; classification uses the 'binary target
    name' with metric 'AUC'.  For V == 3 the chosen names must be one of
    'score_drug_gene_rank' / 'score_drug_gene_threshold'.

    The mapping is modified in place and returned.
    """
    assert 'target_name' not in learn_options or learn_options['target_name'] is not None, \
        "changed it to be automatically set here"

    if classification:
        learn_options["target_name"] = learn_options['binary target name']
        learn_options["training_metric"] = 'AUC'
        learn_options['ground_truth_label'] = learn_options['binary target name']
    else:
        learn_options["target_name"] = learn_options['rank-transformed target name']
        learn_options["training_metric"] = 'spearmanr'
        learn_options['ground_truth_label'] = learn_options['target_name']

    if learn_options["V"] == 3:
        # merged data only carries the rank / threshold scores
        allowed = ('score_drug_gene_rank', 'score_drug_gene_threshold')
        assert learn_options['target_name'] in allowed, "cannot use raw scores when mergind data"
        assert learn_options["ground_truth_label"] in allowed, "cannot use raw scores when mergind data"

    return learn_options
47
+
48
+
49
def GP_setup(learn_options, likelihood='gaussian', degree=3, set_target_fn=set_target):
    """Configure learn_options for the "GPy" method.

    Stores the kernel degree, sets 'warpedGP' to True only when likelihood
    is 'warped', then applies set_target_fn in regression mode and returns
    its result.
    """
    learn_options["method"] = "GPy"
    learn_options['kernel degree'] = degree
    learn_options['warpedGP'] = bool(likelihood == 'warped')
    return set_target_fn(learn_options, classification=False)
60
+
61
+
62
def SVC_setup(learn_options, likelihood='gaussian', degree=3, set_target_fn=set_target):
    """Configure learn_options for the "SVC" method (classification targets).

    likelihood and degree are accepted but not used.
    """
    learn_options["method"] = "SVC"
    return set_target_fn(learn_options, classification=True)
67
+
68
+
69
def L1_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for L1-penalised linear regression.

    Regression targets are set first; 'alpha' is only filled in (with a
    geometric grid of 100 values starting at 1e-6, ratio 1.3) when the
    caller has not supplied one.
    """
    options = set_target_fn(learn_options, classification=False)
    options["method"] = "linreg"
    options["penalty"] = "L1"
    options["feature_select"] = False
    if "alpha" not in options:
        options["alpha"] = np.array([1e-6 * 1.3 ** k for k in range(100)])
    options["loss"] = "squared"
    return options
79
+
80
+
81
def L2_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for L2-penalised linear regression.

    Regression targets are set first; 'alpha' is only filled in (with a
    geometric grid of 100 values starting at 1e-6, ratio 1.3) when the
    caller has not supplied one.
    """
    options = set_target_fn(learn_options, classification=False)
    options["method"] = "linreg"
    options["penalty"] = "L2"
    options["feature_select"] = False
    if "alpha" not in options:
        options["alpha"] = np.array([1e-6 * 1.3 ** k for k in range(100)])
    options["loss"] = "squared"
    return options
91
+
92
+
93
def mean_setup(learn_options, set_target_fn=set_target):
    """Set regression targets and select the 'mean' method."""
    options = set_target_fn(learn_options, classification=False)
    options['method'] = 'mean'
    return options
97
+
98
+
99
def random_setup(learn_options, set_target_fn=set_target):
    """Set regression targets and select the 'random' method."""
    options = set_target_fn(learn_options, classification=False)
    options['method'] = 'random'
    return options
103
+
104
+
105
def elasticnet_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for elastic-net ("EN") linear regression.

    'alpha' is only filled in (30 values, 1e-5 doubled at each step) when
    the caller has not supplied one.
    """
    options = set_target_fn(learn_options, classification=False)
    options["method"] = "linreg"
    options["penalty"] = "EN"
    options["feature_select"] = False
    options["loss"] = "squared"
    if "alpha" not in options:
        options["alpha"] = np.array([1e-5 * 2 ** k for k in range(30)])
    return options
114
+
115
+
116
def DNN_setup(learn_options, set_target_fn=set_target):
    """Set regression targets and select the 'DNN' method with 'score' as its target variable."""
    options = set_target_fn(learn_options, classification=False)
    options['method'] = 'DNN'
    # alternative target variable: 'score_drug_gene_quantized'
    options['DNN target variable'] = 'score'
    # options['DNN architecture'] = (119, 10, 10, 10, 2)
    return options
122
+
123
+
124
def RF_setup(learn_options, set_target_fn=set_target):
    """Set regression targets and select the 'RandomForestRegressor' method."""
    options = set_target_fn(learn_options, classification=False)
    options['method'] = 'RandomForestRegressor'
    return options
128
+
129
+
130
def doench_setup(learn_options, set_target_fn=set_target):
    """Set classification targets and select the 'doench' method."""
    options = set_target_fn(learn_options, classification=True)
    options['method'] = 'doench'
    return options
134
+
135
+
136
def sgrna_from_doench_setup(learn_options, set_target_fn=set_target):
    """Set regression targets and select the 'sgrna_from_doench' method."""
    options = set_target_fn(learn_options, classification=False)
    options['method'] = 'sgrna_from_doench'
    return options
140
+
141
+
142
def linreg_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for plain linear regression.

    Uses the "linreg" method with an L1 penalty whose 'alpha' defaults to
    a single 0.0 (only when the caller has not supplied one); regression
    targets are applied last.
    """
    learn_options["method"] = "linreg"
    learn_options["penalty"] = "L1"
    learn_options["feature_select"] = False
    if "alpha" not in learn_options:
        learn_options["alpha"] = np.array([0.0])
    learn_options["loss"] = "squared"

    return set_target_fn(learn_options, classification=False)
152
+
153
+
154
def logregL1_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for L1-penalised logistic regression.

    Classification targets are set first.  'alpha' (geometric grid of 100
    values starting at 1e-6, ratio 1.3) and 'fit_intercept' (True) are
    only filled in when the caller has not supplied them.
    """
    options = set_target_fn(learn_options, classification=True)
    options["method"] = "logregL1"
    options["penalty"] = "L1"
    options["feature_select"] = False
    if "alpha" not in options:
        options["alpha"] = np.array([1e-6 * 1.3 ** k for k in range(100)])
    options.setdefault("fit_intercept", True)
    return options
164
+
165
+
166
def LASSOs_ensemble_setup(learn_options, set_target_fn=set_target):
    """Configure learn_options for the "lasso_ensemble" method.

    Regression targets are set first; 'alpha' is only filled in (with a
    geometric grid of 100 values starting at 1e-6, ratio 1.3) when the
    caller has not supplied one.
    """
    options = set_target_fn(learn_options, classification=False)
    options["method"] = "lasso_ensemble"
    options["penalty"] = "L1"
    options["feature_select"] = False
    if "alpha" not in options:
        options["alpha"] = np.array([1e-6 * 1.3 ** k for k in range(100)])
    options["loss"] = "squared"
    return options
176
+
177
+
178
def xu_et_al_setup(learn_options, set_target_fn=set_target):
    """Set classification targets and select the "xu_et_al" method."""
    options = set_target_fn(learn_options, classification=True)
    options["method"] = "xu_et_al"
    return options
183
+
184
+
185
def adaboost_setup(learn_options, num_estimators=100, max_depth=3, learning_rate=0.1, set_target_fn=set_target,
                   model="AdaBoost"):
    """Configure learn_options for the AdaBoost regressor or classifier.

    model -- "AdaBoost" selects the "AdaBoostRegressor" method,
             "AdaBoostClassifier" selects the classifier; anything else
             raises Exception.

    For the regressor, 'adaboost_loss' defaults to 'squared_error' when
    the caller has not set it.  (The original test compared `model`
    against "AdaBoostRegressor", a value the validation above never lets
    through, so the default was never applied.)  A caller-supplied loss is
    left untouched.

    learn_options must contain 'adaboost_CV'; when it is falsy the
    learning rate, number of estimators and max depth are stored,
    otherwise only the number of estimators.

    Returns the (in-place updated) options returned by set_target_fn.
    """
    learn_options = set_target_fn(learn_options, classification=False)
    if model == "AdaBoost":
        learn_options['method'] = "AdaBoostRegressor"
    elif model == "AdaBoostClassifier":
        learn_options['method'] = "AdaBoostClassifier"
    else:
        raise Exception("model must be either AdaBoost or AdaBoost Classifier")
    learn_options['adaboost_version'] = 'python'  # "R" or "python"

    # key on the resolved method name, not on the raw `model` argument
    if learn_options['method'] == "AdaBoostRegressor" and 'adaboost_loss' not in learn_options:
        learn_options['adaboost_loss'] = 'squared_error'

    if 'adaboost_alpha' not in learn_options:
        learn_options['adaboost_alpha'] = 0.5  # this parameter is only used by the huber and quantile loss functions.

    if not learn_options['adaboost_CV']:
        learn_options['adaboost_learning_rate'] = learning_rate
        learn_options['adaboost_n_estimators'] = num_estimators
        learn_options['adaboost_max_depth'] = max_depth
    else:
        learn_options['adaboost_n_estimators'] = num_estimators

    return learn_options
215
+
216
+
217
def shared_setup(learn_options, order, test):
    """Fill in default learning options and configure multiprocessing.

    ``learn_options`` is modified in place: "order" is always overwritten with
    ``order`` and "num_proc" with the value returned by
    ``azimuth.local_multiprocessing.configure``; every other key is only set
    when the caller has not provided it.

    Args:
        learn_options: dict of learning options (mutated).
        order: feature order to record under ``learn_options["order"]``.
        test: forwarded to the multiprocessing configuration as ``TEST``.

    Returns:
        The ``num_proc`` value returned by the multiprocessing configuration.
    """
    learn_options.setdefault('num_proc', None)
    learn_options.setdefault('num_thread_per_proc', None)

    num_proc = azimuth.local_multiprocessing.configure(TEST=test, num_proc=learn_options["num_proc"],
                                                       num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    defaults = (
        ("cv", "gene"),  # if no CV preference is specified, use leave-one-gene-out
        ("normalize_features", True),
        ("weighted", None),
        ("all pairs", False),
        ("include_known_pairs", False),
        ("include_gene_guide_feature", 0),  # used as window size, so 0 is none
        # these should default to true to match experiments before they were options:
        ("gc_features", True),
        ("nuc_features", True),
        ("train_genes", None),
        ("test_genes", None),
        ("seed", 1),
        ("flipV1target", False),
        ("num_genes_remove_train", None),
        ("include_microhomology", False),
        ("algorithm_hyperparam_search", "grid"),  # other options is bo for bayesian optimization
    )
    for key, value in defaults:
        learn_options.setdefault(key, value)

    return num_proc
281
+
282
+
283
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    """Load the training data and featurize it according to ``learn_options``.

    Args:
        test: when true, ``learn_options["order"]`` is forced to 1 after loading.
        order: feature order, recorded by ``shared_setup``.
        learn_options: dict of learning options (required despite the ``None``
            default); mutated in place.
        data_file: forwarded to ``azimuth.load_data.from_file``.
        pam_audit: forwarded to ``feat.featurize_data``.
        length_audit: forwarded to ``feat.featurize_data``.

    Returns:
        Tuple ``(Y, feature_sets, target_genes, learn_options, num_proc)``.

    Raises:
        TypeError: if ``learn_options`` is None.
        AssertionError: if "testing_non_binary_target_name" is missing, or the
            first sequence does not have the length expected by
            ``left_right_guide_ind``.
        Exception: if "testing_non_binary_target_name" has an unsupported value.
    """
    if learn_options is None:
        # Fail with a readable message instead of an opaque error inside shared_setup.
        raise TypeError("setup() requires a learn_options dict")

    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options, \
        "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print(
            "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        # Build the converted column in one pass and assign it whole; the former
        # per-row `Xdf['30mer'].iloc[i] = ...` was a chained assignment that may
        # write to a temporary copy instead of Xdf.
        converted = [
            azimuth.util.convert_to_thirty_one(thirty_mer, index_entry[1], strand)
            for thirty_mer, index_entry, strand in zip(Xdf["30mer"].values, Xdf.index.values, Xdf["Strand"].values)
        ]
        Xdf["30mer"] = converted
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop the first nucleotide

    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        assert len(Xdf["30mer"].values[0]) == expected_length
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit,
                                       length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
319
+
320
+
321
def run_models(models, orders, GP_likelihoods=['gaussian', 'warped'], WD_kernel_degrees=[3],
               adaboost_learning_rates=[0.1], adaboost_num_estimators=[100], adaboost_max_depths=[3],
               learn_options_set=None, test=False, CV=True, setup_function=setup, set_target_fn=set_target,
               pam_audit=True, length_audit=True, return_data=False):
    '''
    Run every requested model for every entry of learn_options_set.

    CV is set to false if want to train a final model and not cross-validate, but it goes in to what
    looks like cv code

    Args:
        models: iterable of model names.
        orders: iterable of feature orders, used for models needing explicit featurization.
        GP_likelihoods, WD_kernel_degrees: grids for the "GP" model; None selects the defaults.
        adaboost_learning_rates, adaboost_num_estimators, adaboost_max_depths: AdaBoost grids.
        learn_options_set: dict mapping a label to a learn_options dict (required).
        test: forwarded to the setup function and to pd.cross_validate as TEST.
        CV: forwarded to pd.cross_validate; when False exactly one model and one
            learn_options entry are allowed.
        setup_function: callable producing (Y, feature_sets, target_genes, learn_options, num_proc).
        set_target_fn: forwarded to the per-model setup helpers.
        pam_audit, length_audit: forwarded to the setup function.
        return_data: accepted but not used in this function.

    Returns:
        (results, all_learn_options), both dicts keyed by the generated model string.
    '''
    results = {}
    assert learn_options_set is not None, "need to specify learn_options_set"
    all_learn_options = {}

    # runner() forwards None for grids the caller left unset; fall back to the defaults.
    if GP_likelihoods is None:
        GP_likelihoods = ['gaussian', 'warped']
    if WD_kernel_degrees is None:
        WD_kernel_degrees = [3]

    # shorten so easier to display on graphs
    feat_models_short = {'L1': "L1", 'L2': "L2", 'elasticnet': "EN", 'linreg': "LR",
                         'RandomForest': "RF",
                         'AdaBoost': "AB", 'AdaBoostClassifier': "ABClass", 'doench': 'doench',
                         "logregL1": "logregL1", "sgrna_from_doench": "sgrna_from_doench", 'SVC': 'SVC',
                         'xu_et_al': 'xu_et_al'}

    if not CV:
        print("Received option CV=False, so I'm training using all of the data")
        assert len(learn_options_set) == 1, "when CV is False, only 1 set of learn options is allowed"
        assert len(models) == 1, "when CV is False, only 1 model is allowed"

    for learn_options_str in list(learn_options_set.keys()):
        # these options get augmented in setup
        partial_learn_opt = learn_options_set[learn_options_str]
        for model in models:
            # models requiring explicit featurization
            if model in feat_models_short:
                for order in orders:
                    print("running %s, order %d for %s" % (model, order, learn_options_str))

                    # TODO precompute features for all orders, as this is repeated for each model
                    Y, feature_sets, target_genes, learn_options, num_proc = setup_function(
                        test=test, order=order, learn_options=partial_learn_opt,
                        pam_audit=pam_audit, length_audit=length_audit)

                    if model == 'L1':
                        learn_options_model = L1_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == 'L2':
                        learn_options_model = L2_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == 'elasticnet':
                        learn_options_model = elasticnet_setup(copy.deepcopy(learn_options),
                                                               set_target_fn=set_target_fn)
                    elif model == 'linreg':
                        learn_options_model = linreg_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == "logregL1":
                        learn_options_model = logregL1_setup(copy.deepcopy(learn_options),
                                                             set_target_fn=set_target_fn)
                    elif model == 'RandomForest':
                        learn_options_model = RF_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == 'SVC':
                        learn_options_model = SVC_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == 'doench':
                        learn_options_model = doench_setup(copy.deepcopy(learn_options), set_target_fn=set_target_fn)
                    elif model == 'sgrna_from_doench':
                        learn_options_model = sgrna_from_doench_setup(copy.deepcopy(learn_options),
                                                                      set_target_fn=set_target_fn)
                    elif model == 'xu_et_al':
                        learn_options_model = xu_et_al_setup(copy.deepcopy(learn_options),
                                                             set_target_fn=set_target_fn)
                    elif model in ('AdaBoost', 'AdaBoostClassifier'):
                        # was `model == 'AdaBoost' or 'AdaBoostClassifier'`, which is always truthy
                        # NOTE(review): only the last grid combination survives these loops;
                        # kept as-is because the original nesting could not be confirmed.
                        for learning_rate in adaboost_learning_rates:
                            for num_estimators in adaboost_num_estimators:
                                for max_depth in adaboost_max_depths:
                                    learn_options_model = adaboost_setup(copy.deepcopy(learn_options),
                                                                         learning_rate=learning_rate,
                                                                         num_estimators=num_estimators,
                                                                         max_depth=max_depth,
                                                                         set_target_fn=set_target_fn, model=model)
                        model_string = feat_models_short[model] + '_or%d_md%d_lr%.2f_n%d_%s' % (
                            learn_options_set[learn_options_str]["order"], max_depth, learning_rate, num_estimators,
                            learn_options_str)
                    if model != 'AdaBoost':
                        model_string = feat_models_short[model] + '_ord%d_%s' % (
                            learn_options_set[learn_options_str]["order"], learn_options_str)

                    results[model_string] = pd.cross_validate(Y, feature_sets, learn_options=learn_options_model,
                                                              TEST=test, CV=CV)

                    all_learn_options[model_string] = learn_options_model
            # if the model doesn't require explicit featurization
            else:
                # was `setup_fn`, an undefined name that raised NameError on this path
                assert setup_function == setup, "not yet modified to handle this"
                print("running %s for %s" % (model, learn_options_str))
                Y, feature_sets, target_genes, learn_options, num_proc = setup(test=test, order=1,
                                                                               learn_options=partial_learn_opt,
                                                                               pam_audit=pam_audit,
                                                                               length_audit=length_audit)
                if model == 'mean':
                    learn_options_model = mean_setup(copy.deepcopy(learn_options))
                elif model == 'random':
                    learn_options_model = random_setup(copy.deepcopy(learn_options))
                elif model == 'DNN':
                    learn_options_model = DNN_setup(copy.deepcopy(learn_options))
                elif model == 'GP':
                    for likelihood in GP_likelihoods:
                        for degree in WD_kernel_degrees:
                            learn_options_model = GP_setup(copy.deepcopy(learn_options), likelihood=likelihood,
                                                           degree=degree)
                            model_string = '%s_%s_degree%d_%s' % (model, likelihood, degree, learn_options_str)
                            results[model_string] = pd.cross_validate(Y, feature_sets,
                                                                      learn_options=learn_options_model, TEST=test,
                                                                      CV=CV)
                else:
                    raise NotImplementedError("model %s not supported" % model)

                # "GP" already calls pd.cross_validate() and has its own model_string, so skip this.
                if model != "GP":
                    model_string = model + '_%s' % learn_options_str
                    results[model_string] = pd.cross_validate(Y, feature_sets, learn_options=learn_options_model,
                                                              TEST=test, CV=CV)

                all_learn_options[model_string] = learn_options_model

    return results, all_learn_options
439
+
440
+
441
def pickle_runner_results(exp_name, results, all_learn_options, relpath="/../" + "results"):
    """Pickle ``(results, all_learn_options)`` next to this module.

    The output directory is this file's directory with ``relpath`` appended;
    it is created when missing. When ``exp_name`` is None the first key of
    ``results`` names the file.
    """
    out_dir = os.path.dirname(os.path.abspath(__file__)) + relpath
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print("Created directory: %s" % str(out_dir))

    name = list(results.keys())[0] if exp_name is None else exp_name
    target = out_dir + '/' + name + '.pickle'

    with open(target, 'wb') as handle:
        print("writing results to %s" % target)
        pickle.dump((results, all_learn_options), handle, -1)
453
+
454
+
455
def runner(models, learn_options, GP_likelihoods=None, orders=None, WD_kernel_degrees=None, where='local',
           cluster_user='fusi', cluster='RR1-N13-09-H44', test=False, exp_name=None, **kwargs):
    """Run the requested models locally or prepare a cluster job.

    Args:
        models: model names forwarded to ``run_models`` / ``cluster_job.create``.
        learn_options: learn-options set (label -> options dict).
        GP_likelihoods, WD_kernel_degrees: optional grids; when None the
            ``run_models`` defaults are used for local runs.
        orders: feature orders forwarded unchanged.
        where: 'local' or 'cluster'.
        cluster_user: user name handed to ``cluster_job.create``.
        cluster: accepted but not used in this function.
        test: forwarded to ``run_models``.
        exp_name: experiment name for the pickled results / cluster job.
        **kwargs: forwarded to ``run_models`` or ``cluster_job.create``.

    Returns:
        For 'local': ``(results, all_learn_options, all_metrics, gene_names)``.
        For 'cluster': ``(tempdir, clust_filename, user)``.

    Raises:
        ValueError: if ``where`` is neither 'local' nor 'cluster' (previously
            the function silently returned None).
    """
    if where not in ('local', 'cluster'):
        raise ValueError("where must be 'local' or 'cluster', got %r" % (where,))

    if where == 'local':
        # Forward only the grids that were actually given: passing None explicitly
        # would override run_models' defaults and break its GP loops.
        grids = {}
        if GP_likelihoods is not None:
            grids['GP_likelihoods'] = GP_likelihoods
        if WD_kernel_degrees is not None:
            grids['WD_kernel_degrees'] = WD_kernel_degrees

        results, all_learn_options = run_models(models, orders=orders, learn_options_set=learn_options,
                                                test=test, **grids, **kwargs)
        all_metrics, gene_names = azimuth.util.get_all_metrics(results, learn_options)
        azimuth.util.plot_all_metrics(all_metrics, gene_names, all_learn_options, save=True)

        # for non-local (i.e. cluster), the comparable code is in cli_run_model.py
        pickle_runner_results(exp_name, results, all_learn_options)

        return results, all_learn_options, all_metrics, gene_names

    from . import cluster_job

    # create random cluster directory, dump learn options, and create cluster file
    tempdir, user, clust_filename = cluster_job.create(cluster_user, models, orders, WD_kernel_degrees,
                                                       GP_likelihoods, exp_name=exp_name,
                                                       learn_options=learn_options, **kwargs)

    return tempdir, clust_filename, user
486
+
487
+
488
def save_final_model_V3(filename=None, include_position=True, learn_options=None, short_name='final', pam_audit=True,
                        length_audit=True):
    '''
    Train the final AdaBoost model on all data (CV=False) and pickle it.

    run_models(produce_final_model=True) is what saves the model

    Args:
        filename: destination of the pickled ``(model, learn_options)`` tuple (required).
        include_position: used only when ``learn_options`` is None; sets the
            "include_gene_position" option of the default configuration.
        learn_options: optional full option dict replacing the defaults.
        short_name: label used as the single key of the learn-options set.
        pam_audit, length_audit: forwarded to ``run_models``.

    Returns:
        The trained model, taken from ``results[...][3][0]``. A failure to write
        the pickle is printed with a traceback and does not raise.
    '''
    test = False
    assert filename is not None, "need to provide filename to save final model"

    if learn_options is None:
        # The two former literal dicts differed only in "include_gene_position".
        learn_options = {"V": 3,
                         'train_genes': azimuth.load_data.get_V3_genes(),
                         'test_genes': azimuth.load_data.get_V3_genes(),
                         "testing_non_binary_target_name": 'ranks',
                         'include_pi_nuc_feat': True,
                         "gc_features": True,
                         "nuc_features": True,
                         "include_gene_position": bool(include_position),
                         "include_NGGX_interaction": True,
                         "include_Tm": True,
                         "include_strand": False,
                         "include_gene_feature": False,
                         "include_gene_guide_feature": 0,
                         "extra pairs": False,
                         "weighted": None,
                         "training_metric": 'spearmanr',
                         "NDGC_k": 10,
                         "cv": "gene",
                         "include_gene_effect": False,
                         "include_drug": False,
                         "include_sgRNAscore": False,
                         'adaboost_loss': 'squared_error',
                         # main "ls", alternatives: "lad", "huber", "quantile", see scikit docs for details
                         'adaboost_alpha': 0.5,
                         # this parameter is only used by the huber and quantile loss functions.
                         'normalize_features': False,
                         'adaboost_CV': False
                         }

    learn_options_set = {short_name: learn_options}
    # pam_audit used to be passed as length_audit, silently ignoring the caller's pam_audit.
    results, all_learn_options = run_models(["AdaBoost"], orders=[2], adaboost_learning_rates=[0.1],
                                            adaboost_max_depths=[3], adaboost_num_estimators=[100],
                                            learn_options_set=learn_options_set,
                                            test=test, CV=False, pam_audit=pam_audit, length_audit=length_audit)

    print(f"Results: {results}")
    print(f"All learn options: {all_learn_options}")

    model = list(results.values())[0][3][0]

    try:
        with open(filename, 'wb') as f:
            pickle.dump((model, learn_options), f, protocol=4)
        print(f"Model saved successfully to {filename}")
    except Exception as e:
        # best-effort save: report the problem but still return the trained model
        print(f"Error saving model to {filename}: {str(e)}")
        traceback.print_exc()

    return model
576
+
577
+
578
def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, pam_audit=True, length_audit=False,
            learn_options_override=None):
    """
    Score guide sequences with a saved (or supplied) model.

    Args:
        seq: numpy array of 30 nt sequences.
        aa_cut: numpy array of amino acid cut positions (optional).
        percent_peptide: numpy array of percent peptide (optional).
        model: ``(model, learn_options)`` tuple to use for prediction (optional).
            When given it takes precedence and no file is loaded.
        model_file: file name of pickled model to use for prediction (optional).
            When both ``model`` and ``model_file`` are None, a bundled model from
            the ``saved_models`` directory next to this module is used.
        pam_audit: check PAM of each sequence.
        length_audit: check length of each sequence.
        learn_options_override: a dictionary indicating which learn_options to override (optional).

    Returns: a numpy array of predictions.

    Raises:
        AssertionError: on invalid inputs, or if the model returned only 0s and 1s.
        FileNotFoundError: if the bundled default model file is missing.
    """
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    # check len(seq) first so an empty array fails the assertion instead of raising IndexError
    assert len(seq) > 0 and len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0],
                      str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(percent_peptide, (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"

    if model is None:
        if model_file is None:
            if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
                print("No model file specified, using V3_model_nopos")
                model_name = 'V3_model_nopos.pickle'
            else:
                print("No model file specified, using V3_model_full")
                model_name = 'V3_model_full.pickle'

            model_file = os.path.join(os.path.dirname(__file__), 'saved_models', model_name)
            print(f"Looking for model file: {model_file}")

            if not os.path.exists(model_file):
                print(f"Model file not found: {model_file}")
                print("Please run 'python azimuth/model_comparison.py' to generate the model files.")
                print("After generating the models, move them to a 'saved_models' directory in your project root.")
                raise FileNotFoundError(f"Model file not found: {model_file}")

        # NOTE: unpickling executes code from the file; only load trusted model files.
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f, encoding='bytes')
    else:
        # A caller-supplied model used to be overwritten by the default pickle
        # whenever model_file was None; it is now used as given.
        model, learn_options = model

    learn_options["V"] = 2

    learn_options = override_learn_options(learn_options_override, learn_options)

    Xdf = pandas.DataFrame(columns=['30mer', 'Strand'], data=list(zip(seq, ['NA' for x in range(len(seq))])))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        gene_position = pandas.DataFrame(columns=['Percent Peptide', 'Amino Acid Cut position'],
                                         data=list(zip(percent_peptide, aa_cut)))
    else:
        # no usable position information: fill both columns with the -1 sentinel
        gene_position = pandas.DataFrame(columns=['Percent Peptide', 'Amino Acid Cut position'],
                                         data=list(zip(np.ones(seq.shape[0]) * -1, np.ones(seq.shape[0]) * -1)))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit,
                                       length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    assert np.any(~np.isin(np.asarray(preds), [0, 1])), "model returned only 0s and 1s"
    return preds
671
+
672
+
673
def override_learn_options(learn_options_override, learn_options):
    """
    Copy every key of learn_options_override into learn_options (in place),
    replacing existing values, and return learn_options. A None override is a no-op.
    """
    if learn_options_override is None:
        return learn_options
    learn_options.update(learn_options_override)
    return learn_options
681
+
682
+
683
def fill_learn_options(learn_options_used_to_fill, learn_options_with_possible_missing):
    """
    Add to learn_options_with_possible_missing (in place) only those keys of
    learn_options_used_to_fill it does not already have; existing values are
    kept. Returns the filled dict. A None filler is a no-op.
    """
    if learn_options_used_to_fill is None:
        return learn_options_with_possible_missing
    for key, fallback in learn_options_used_to_fill.items():
        learn_options_with_possible_missing.setdefault(key, fallback)
    return learn_options_with_possible_missing
692
+
693
+
694
def write_results(predictions, file_to_predict):
    """Write the input CSV plus a 'predictions' column to a sibling ``.pred.csv`` file.

    Only the last ".csv" in the file name (not in any directory component) is
    rewritten to ".pred.csv"; when the name contains no ".csv" the suffix is
    appended, so the input file is never overwritten.

    Args:
        predictions: values assigned to the new 'predictions' column.
        file_to_predict: path of the CSV file that was scored.

    Returns:
        ``(data, newfile)``: the augmented DataFrame and the path written.
    """
    basename = os.path.basename(file_to_predict)
    directory_part = file_to_predict[:len(file_to_predict) - len(basename)]
    stem, marker, tail = basename.rpartition(".csv")
    if marker:
        new_basename = stem + ".pred.csv" + tail
    else:
        new_basename = basename + ".pred.csv"
    newfile = directory_part + new_basename

    data = pandas.read_csv(file_to_predict)
    data['predictions'] = predictions
    data.to_csv(newfile)
    print("wrote results to %s" % newfile)
    return data, newfile
701
+
702
+
703
if __name__ == '__main__':
    # Script entry point: train and pickle both V3 models into the user's home directory.
    try:
        # Model trained without gene-position features
        nopos_path = os.path.expanduser('~/V3_model_nopos.pickle')
        nopos_model = save_final_model_V3(filename=nopos_path, include_position=False)
        print("Model without position information saved successfully.")

        # Model trained with gene-position features
        full_path = os.path.expanduser('~/V3_model_full.pickle')
        full_model = save_final_model_V3(filename=full_path, include_position=True)
        print("Model with position information saved successfully.")

        print("Both models saved successfully.")
    except Exception as e:
        print(f"An error occurred while saving models: {str(e)}")
        traceback.print_exc()
src/utils/azimuth/predict.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import sklearn
3
+ from sklearn.metrics import roc_curve, auc
4
+ import sklearn.metrics
5
+ import sklearn.model_selection
6
+ import copy
7
+ from . import util
8
+ import time
9
+ # from . import metrics as ranking_metrics
10
+ # import azimuth.models.regression
11
+ # import azimuth.models.ensembles
12
+ # import azimuth.models.DNN
13
+ # import azimuth.models.baselines
14
+ import multiprocessing
15
+
16
+
17
def fill_in_truth_and_predictions(truth, predictions, fold, y_all, y_pred, learn_options, test):
    """Append one fold's held-out truth values and predictions.

    For the rows selected by ``test``, the rank-transformed and binary target
    columns of ``y_all`` are appended to ``truth[fold]['ranks']`` and
    ``truth[fold]['thrs']``; the flattened ``y_pred`` is appended to
    ``predictions[fold]``. Both containers are updated in place and returned.
    """
    fold_truth = truth[fold]

    def held_out(column_key):
        # values of the column named by learn_options[column_key], test rows only
        return y_all[learn_options[column_key]].values[test].flatten()

    fold_truth['ranks'] = np.hstack((fold_truth['ranks'], held_out('rank-transformed target name')))
    fold_truth['thrs'] = np.hstack((fold_truth['thrs'], held_out('binary target name')))

    # NOTE(review): the guard looks for 'raw_target_name' (underscores) while the
    # lookup uses 'raw target name' (spaces) — confirm which key is intended.
    if 'raw_target_name' in learn_options:
        fold_truth['raw'] = np.hstack((fold_truth['raw'], held_out('raw target name')))

    predictions[fold] = np.hstack((predictions[fold], y_pred.flatten()))
    return truth, predictions
31
+
32
+
33
def construct_filename(learn_options, TEST):
    """Build (and print) a descriptive file-name stem from the learning options.

    The stem is: version prefix ("V<V>" or "offV1", replaced by "TEST." when
    TEST is true), method, ".order<N>", target name, the penalty for "linreg",
    the CV scheme, and a suffix for the training metric when it is one of
    NDCG / AUC / spearmanr.
    """
    prefix = "V%s" % learn_options["V"] if "V" in learn_options else "offV1"
    if TEST:
        prefix = "TEST."

    method = learn_options["method"]
    pieces = [prefix, method, '.order%d' % learn_options["order"], learn_options["target_name"]]

    # "GPy" contributes nothing extra; "linreg" adds its penalty
    if method != "GPy" and method == "linreg":
        pieces.append("." + learn_options["penalty"])
    pieces.append("." + learn_options["cv"])

    metric = learn_options["training_metric"]
    if metric == "NDCG":
        pieces.append(".NDGC_%d" % learn_options["NDGC_k"])
    elif metric == "AUC":
        pieces.append(".AUC")
    elif metric == 'spearmanr':
        pieces.append(".spearman")

    filename = "".join(pieces)
    print("filename = %s" % filename)
    return filename
70
+
71
def print_summary(global_metric, results, learn_options, feature_sets, flags):
    """Print a human-readable summary of a cross-validation run.

    Args:
        global_metric: overall metric value, printed under ``learn_options['metric']``.
        results: indexable result container; ``results[0]`` holds the values whose
            median is reported and ``results[4]`` the total number of features.
        learn_options: option dict; printed in full and queried for individual entries.
        feature_sets: dict whose keys (feature-set names) are listed.
        flags: dict whose keys name the ``learn_options`` entries to print.
    """
    print("\nSummary:")
    print(learn_options)
    print("\t\tglobal %s=%.2f" % (learn_options['metric'], global_metric))
    print("\t\tmedian %s across folds=%.2f" % (learn_options['metric'], np.median(results[0])))
    print("\t\torder=%d" % learn_options["order"])
    if 'kerntype' in learn_options:
        # this line was a bare string expression, so the kernel type was never printed
        print("\t\tkern type = %s" % learn_options['kerntype'])
    if 'degree' in learn_options:
        print("\t\tdegree=%d" % learn_options['degree'])
    print("\t\ttarget_name=%s" % learn_options["target_name"])

    for k in flags:
        print('\t\t' + k + '=' + str(learn_options[k]))

    print("\t\tfeature set:")
    for set_name in feature_sets:  # renamed from `set`, which shadowed the builtin
        print("\t\t\t%s" % set_name)
    print("\t\ttotal # features=%d" % results[4])
88
+
89
def extract_fpr_tpr_for_fold(aucs, fold, i, predictions, truth, y_binary, test, y_pred):
    """Compute the ROC AUC of ``y_pred`` against ``y_binary[test]`` and append it to ``aucs``.

    ``fold``, ``i``, ``predictions`` and ``truth`` are accepted but not used here.
    Asserts that ``y_binary`` has at most two distinct values.
    """
    distinct_labels = np.unique(y_binary)
    assert len(distinct_labels) <= 2, "if using AUC need binary targets"
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_binary[test], y_pred)
    aucs.append(auc(false_pos_rate, true_pos_rate))
94
+
95
def extract_NDCG_for_fold(metrics, fold, i, predictions, truth, y_ground_truth, test, y_pred, learn_options):
    """Append the fold's NDCG@k (k = ``learn_options["NDGC_k"]``) to ``metrics``.

    ``fold``, ``i``, ``predictions`` and ``truth`` are accepted but not used here.
    """
    # NOTE(review): the `ranking_metrics` import is commented out at the top of
    # this file, so this call presumably raises NameError — confirm and restore.
    held_out_truth = y_ground_truth[test].flatten()
    fold_score = ranking_metrics.ndcg_at_k_ties(held_out_truth, y_pred.flatten(), learn_options["NDGC_k"])
    metrics.append(fold_score)
98
+
99
def extract_spearman_for_fold(metrics, fold, i, predictions, truth, y_ground_truth, test, y_pred, learn_options):
    """Append the fold's Spearman correlation (via ``util.spearmanr_nonan``) to ``metrics``.

    ``fold``, ``i``, ``predictions``, ``truth`` and ``learn_options`` are accepted
    but not used here. Asserts that the correlation is not NaN.
    """
    held_out_truth = y_ground_truth[test].flatten()
    correlation = util.spearmanr_nonan(held_out_truth, y_pred.flatten())[0]
    assert not np.isnan(correlation), "found nan spearman"
    metrics.append(correlation)
103
+
104
def get_train_test(test_gene, y_all, train_genes=None):
    """Return ``(train, test)`` integer row indices for holding out one gene.

    Rows are classified by the 'Target gene' level of ``y_all``'s index. Training
    rows are those whose gene differs from ``test_gene`` and, when
    ``train_genes`` is given, is one of ``train_genes``. Test rows are those of
    ``test_gene``, except that the special gene 'dummy' makes the test rows
    equal to the training rows.
    """
    # train_genes + test genes need not cover all genes (e.g. V3 loaded, only V2 used)
    genes = y_all.index.get_level_values('Target gene').values
    train_mask = genes != test_gene

    if train_genes is not None:
        allowed = np.zeros(train_mask.shape, dtype=bool)
        for candidate in train_genes:
            allowed |= (genes == candidate)
        train_mask = np.logical_and(train_mask, allowed)

    test_mask = train_mask if test_gene == 'dummy' else (genes == test_gene)

    # convert boolean masks to positional indices
    return np.flatnonzero(train_mask), np.flatnonzero(test_mask)
127
+
128
+
129
+ def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_genes=None, CV=True):
130
+ '''
131
+ feature_sets is a dictionary of "set name" to pandas.DataFrame
132
+ one set might be single-nucleotide, position-independent features of order X, for e.g.
133
+ Method: "GPy" or "linreg"
134
+ Metric: NDCG (learning to rank metric, Normalized Discounted Cumulative Gain); AUC
135
+ Output: cv_score_median, gene_rocs
136
+ When CV=False, it trains on everything (and tests on everything, just to fit the code)
137
+ '''
138
+
139
+ print("range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values), np.max(y_all[learn_options['target_name']].values)))
140
+
141
+ allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "AdaBoostClassifier",
142
+ "DecisionTreeRegressor", "RandomForestRegressor",
143
+ "ARDRegression", "GPy_fs", "mean", "random", "DNN",
144
+ "lasso_ensemble", "doench", "logregL1", "sgrna_from_doench", 'SVC', 'xu_et_al']
145
+
146
+ assert learn_options["method"] in allowed_methods,"invalid method: %s" % learn_options["method"]
147
+ assert learn_options["method"] == "linreg" and learn_options['penalty'] == 'L2' or learn_options["weighted"] is None, "weighted only works with linreg L2 right now"
148
+
149
+ # construct filename from options
150
+ filename = construct_filename(learn_options, TEST)
151
+
152
+ print("Cross-validating genes...")
153
+ t2 = time.time()
154
+
155
+ y = np.array(y_all[learn_options["target_name"]].values[:,None],dtype=np.float64)
156
+
157
+ # concatenate feature sets in to one nparray, and get dimension of each
158
+ inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)
159
+ #import pickle; pickle.dump([y, inputs, feature_names, learn_options], open("saved_models/inputs.p", "wb" )); import ipdb; ipdb.set_trace()
160
+
161
+ if not CV:
162
+ assert learn_options['cv'] == 'gene', 'Must use gene-CV when CV is False (I need to use all of the genes and stratified complicates that)'
163
+
164
+ # set-up for cross-validation
165
+ ## for outer loop, the one Doench et al use genes for
166
+ if learn_options["cv"] == "stratified":
167
+ assert "extra_pairs" not in learn_options or learn_options['extra pairs'], "can't use extra pairs with stratified CV, need to figure out how to properly account for genes affected by two drugs"
168
+ label_encoder = sklearn.preprocessing.LabelEncoder()
169
+ label_encoder.fit(y_all['Target gene'].values)
170
+ gene_classes = label_encoder.transform(y_all['Target gene'].values)
171
+ if 'n_folds' in list(learn_options.keys()):
172
+ n_folds = learn_options['n_folds']
173
+ elif learn_options['train_genes'] is not None and learn_options["test_genes"] is not None:
174
+ n_folds = len(learn_options["test_genes"])
175
+ else:
176
+ n_folds = len(learn_options['all_genes'])
177
+
178
+ cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True)
179
+ fold_labels = ["fold%d" % i for i in range(1,n_folds+1)]
180
+ if learn_options['num_genes_remove_train'] is not None:
181
+ raise NotImplementedException()
182
+ elif learn_options["cv"]=="gene":
183
+ cv = []
184
+
185
+ if not CV:
186
+ train_test_tmp = get_train_test('dummy', y_all) # get train, test split using a dummy gene
187
+ #train_tmp, test_tmp = train_test_tmp
188
+ # not a typo, using training set to test on as well, just for this case. Test set is not used
189
+ # for internal cross-val, etc. anyway.
190
+ #train_test_tmp = (train_tmp, train_tmp)
191
+ cv.append(train_test_tmp)
192
+ fold_labels = ["dummy_for_no_cv"]#learn_options['all_genes']
193
+
194
+ elif learn_options['train_genes'] is not None and learn_options["test_genes"] is not None:
195
+ assert learn_options['train_genes'] is not None and learn_options['test_genes'] is not None, "use both or neither"
196
+ for i, gene in enumerate(learn_options['test_genes']):
197
+ cv.append(get_train_test(gene, y_all, learn_options['train_genes']))
198
+ fold_labels = learn_options["test_genes"]
199
+ # if train and test genes are seperate, there should be only one fold
200
+ train_test_disjoint = set.isdisjoint(set(learn_options["train_genes"].tolist()), set(learn_options["test_genes"].tolist()))
201
+
202
+ else:
203
+ for i, gene in enumerate(learn_options['all_genes']):
204
+ train_test_tmp = get_train_test(gene, y_all)
205
+ cv.append(train_test_tmp)
206
+ fold_labels = learn_options['all_genes']
207
+
208
+ if learn_options['num_genes_remove_train'] is not None:
209
+ for i, (train,test) in enumerate(cv):
210
+ unique_genes = np.random.permutation(np.unique(np.unique(y_all['Target gene'][train])))
211
+ genes_to_keep = unique_genes[0:len(unique_genes) - learn_options['num_genes_remove_train']]
212
+ guides_to_keep = []
213
+ filtered_train = []
214
+ for j, gene in enumerate(y_all['Target gene']):
215
+ if j in train and gene in genes_to_keep:
216
+ filtered_train.append(j)
217
+ cv_i_orig = copy.deepcopy(cv[i])
218
+ cv[i] = (filtered_train, test)
219
+ if learn_options['num_genes_remove_train']==0:
220
+ assert np.all(cv_i_orig[0]==cv[i][0])
221
+ assert np.all(cv_i_orig[1]==cv[i][1])
222
+ print("# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0])))
223
+ print("# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1])))
224
+ else:
225
+ raise Exception("invalid cv options given: %s" % learn_options["cv"])
226
+
227
+ cv = [c for c in cv] #make list from generator, so can subset for TEST case
228
+ if TEST:
229
+ ind_to_use = [0]#[0,1]
230
+ cv = [cv[i] for i in ind_to_use]
231
+ fold_labels = [fold_labels[i] for i in ind_to_use]
232
+
233
+ truth = dict([(t, dict([(m, np.array([])) for m in ['raw', 'ranks', 'thrs']])) for t in fold_labels])
234
+ predictions = dict([(t, np.array([])) for t in fold_labels])
235
+
236
+ m = {}
237
+ metrics = []
238
+
239
+ #do the cross-validation
240
+ num_proc = learn_options["num_proc"]
241
+ if num_proc > 1:
242
+ num_proc = np.min([num_proc,len(cv)])
243
+ print("using multiprocessing with %d procs--one for each fold" % num_proc)
244
+ jobs = []
245
+ pool = multiprocessing.Pool(processes=num_proc)
246
+ for i,fold in enumerate(cv):
247
+ train,test = fold
248
+ print("working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test)))
249
+ if learn_options["method"]=="GPy":
250
+ job = pool.apply_async(azimuth.models.GP.gp_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
251
+ elif learn_options["method"]=="linreg":
252
+ job = pool.apply_async(azimuth.models.regression.linreg_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
253
+ elif learn_options["method"]=="logregL1":
254
+ job = pool.apply_async(azimuth.models.regression.logreg_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
255
+ elif learn_options["method"]=="AdaBoostRegressor":
256
+ job = pool.apply_async(azimuth.models.ensembles.adaboost_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, False))
257
+ elif learn_options["method"]=="AdaBoostClassifier":
258
+ job = pool.apply_async(azimuth.models.ensembles.adaboost_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, True))
259
+ elif learn_options["method"]=="DecisionTreeRegressor":
260
+ job = pool.apply_async(azimuth.models.ensembles.decisiontree_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
261
+ elif learn_options["method"]=="RandomForestRegressor":
262
+ job = pool.apply_async(azimuth.models.ensembles.randomforest_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
263
+ elif learn_options["method"]=="ARDRegression":
264
+ job = pool.apply_async(azimuth.models.regression.ARDRegression_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
265
+ elif learn_options["method"] == "random":
266
+ job = pool.apply_async(azimuth.models.baselines.random_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
267
+ elif learn_options["method"] == "mean":
268
+ job = pool.apply_async(azimuth.models.baselines.mean_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
269
+ elif learn_options["method"] == "SVC":
270
+ job = pool.apply_async(azimuth.models.baselines.SVC_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
271
+ elif learn_options["method"] == "DNN":
272
+ job = pool.apply_async(azimuth.models.DNN.DNN_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
273
+ elif learn_options["method"] == "lasso_ensemble":
274
+ job = pool.apply_async(azimuth.models.ensembles.LASSOs_ensemble_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
275
+ elif learn_options["method"] == "doench":
276
+ job = pool.apply_async(azimuth.models.baselines.doench_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
277
+ elif learn_options["method"] == "sgrna_from_doench":
278
+ job = pool.apply_async(azimuth.models.baselines.sgrna_from_doench_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
279
+ elif learn_options["method"] == "xu_et_al":
280
+ job = pool.apply_async(azimuth.models.baselines.xu_et_al_on_fold, args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
281
+ else:
282
+ raise Exception("did not find method=%s" % learn_options["method"])
283
+ jobs.append(job)
284
+ pool.close()
285
+ pool.join()
286
+ for i,fold in enumerate(cv):#i in range(0,len(jobs)):
287
+ y_pred, m[i] = jobs[i].get()
288
+ train,test = fold
289
+
290
+ if learn_options["training_metric"]=="AUC":
291
+ extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options["ground_truth_label"]].values, test, y_pred)
292
+ elif learn_options["training_metric"]=="NDCG":
293
+ extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
294
+ elif learn_options["training_metric"] == 'spearmanr':
295
+ extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
296
+ else:
297
+ raise Exception("invalid 'training_metric' in learn_options: %s" % learn_options["training_metric"])
298
+
299
+ truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test)
300
+
301
+ pool.terminate()
302
+
303
+ else:
304
+ # non parallel version
305
+ for i,fold in enumerate(cv):
306
+ train,test = fold
307
+ if learn_options["method"]=="GPy":
308
+ y_pred, m[i] = gp_on_fold(azimuth.models.GP.feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
309
+ elif learn_options["method"]=="linreg":
310
+ y_pred, m[i] = azimuth.models.regression.linreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
311
+ elif learn_options["method"]=="logregL1":
312
+ y_pred, m[i] = azimuth.models.regression.logreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
313
+ elif learn_options["method"]=="AdaBoostRegressor":
314
+ y_pred, m[i] = azimuth.models.ensembles.adaboost_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, classification=False)
315
+ elif learn_options["method"]=="AdaBoostClassifier":
316
+ y_pred, m[i] = azimuth.models.ensembles.adaboost_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, classification=True)
317
+ elif learn_options["method"]=="DecisionTreeRegressor":
318
+ y_pred, m[i] = azimuth.models.ensembles.decisiontree_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
319
+ elif learn_options["method"]=="RandomForestRegressor":
320
+ y_pred, m[i] = azimuth.models.ensembles.randomforest_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
321
+ elif learn_options["method"]=="ARDRegression":
322
+ y_pred, m[i] = azimuth.models.regression.ARDRegression_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
323
+ elif learn_options["method"]=="GPy_fs":
324
+ y_pred, m[i] = azimuth.models.GP.gp_with_fs_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
325
+ elif learn_options["method"] == "random":
326
+ y_pred, m[i] = azimuth.models.baselines.random_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
327
+ elif learn_options["method"] == "mean":
328
+ y_pred, m[i] = azimuth.models.baselines.mean_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
329
+ elif learn_options["method"] == "SVC":
330
+ y_pred, m[i] = azimuth.models.baselines.SVC_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
331
+ elif learn_options["method"] == "DNN":
332
+ y_pred, m[i] = azimuth.models.DNN.DNN_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
333
+ elif learn_options["method"] == "lasso_ensemble":
334
+ y_pred, m[i] = azimuth.models.ensembles.LASSOs_ensemble_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
335
+ elif learn_options["method"] == "doench":
336
+ y_pred, m[i] = azimuth.models.baselines.doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
337
+ elif learn_options["method"] == "sgrna_from_doench":
338
+ y_pred, m[i] = azimuth.models.baselines.sgrna_from_doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
339
+ elif learn_options["method"] == "xu_et_al":
340
+ y_pred, m[i] = azimuth.models.baselines.xu_et_al_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
341
+ else:
342
+ raise Exception("invalid method found: %s" % learn_options["method"])
343
+
344
+ if learn_options["training_metric"]=="AUC":
345
+ # fills in truth and predictions
346
+ extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options['ground_truth_label']].values, test, y_pred)
347
+ elif learn_options["training_metric"]=="NDCG":
348
+ extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
349
+ elif learn_options["training_metric"] == 'spearmanr':
350
+ extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth, y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
351
+
352
+ truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i], y_all, y_pred, learn_options, test)
353
+
354
+ print("\t\tRMSE: ", np.sqrt(((y_pred - y[test])**2).mean()))
355
+ print("\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0])
356
+ print("\t\tfinished fold/gene %i of %i" % (i+1, len(fold_labels)))
357
+
358
+
359
+ cv_median_metric =[np.median(metrics)]
360
+ gene_pred = [(truth, predictions)]
361
+ print("\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1]))
362
+
363
+ t3 = time.time()
364
+ print("\t\tElapsed time for cv is %.2f seconds" % (t3-t2))
365
+ return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
src/utils/azimuth/saved_models/V3_model_full.pickle ADDED
Binary file
 
src/utils/azimuth/saved_models/V3_model_nopos.pickle ADDED
Binary file
 
src/utils/azimuth/util.py ADDED
@@ -0,0 +1,1331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas
2
+ # import matplotlib.pylab as plt
3
+ # import pylab as pl # so can just grab qqplotting code from fastlmm directly
4
+ import scipy.stats
5
+ import scipy as sp
6
+ import numpy as np
7
+ import itertools
8
+ import sklearn.metrics
9
+ import Bio.SeqUtils.MeltingTemp as Tm
10
+ import Bio.Entrez as Entrez
11
+ import Bio.SeqUtils as SeqUtil
12
+ # from azimuth.features import microhomology
13
+ from Bio import SeqIO
14
+ # from . import metrics as ranking_metrics
15
+ import os
16
+ import pickle
17
+ import glob
18
+ #import azimuth
19
+ # import azimuth.models
20
+ #import azimuth.models.ensembles as ensembles
21
+ import Bio.Seq as Seq
22
+ import time
23
+ import scipy.stats as st
24
+ from . import util
25
+ import sys
26
+ import pandas as pd
27
+ # from . import corrstats
28
+
29
+ # def qqplot(pvals, fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None,fixaxes=True,addlambda=True,minpval=1e-20,title=None,h1=None,figsize=[5,5],grid=True, markersize=2):
30
+ # '''
31
+ # performs a P-value QQ-plot in -log10(P-value) space
32
+ # -----------------------------------------------------------------------
33
+ # Args:
34
+ # pvals P-values, for multiple methods this should be a list (each element will be flattened)
35
+ # fileout if specified, the plot will be saved to the file (optional)
36
+ # alphalevel significance level for the error bars (default 0.05)
37
+ # if None: no error bars are plotted
38
+ # legend legend string. For multiple methods this should be a list
39
+ # xlim X-axis limits for the QQ-plot (unit: -log10)
40
+ # ylim Y-axis limits for the QQ-plot (unit: -log10)
41
+ # fixaxes Makes xlim=0, and ylim=max of the two ylimits, so that plot is square
42
+ # addlambda Compute and add genomic control to the plot, bool
43
+ # title plot title, string (default: empty)
44
+ # h1 figure handle (default None)
45
+ # figsize size of the figure. (default: [5,5])
46
+ # grid boolean: use a grid? (default: True)
47
+ # Returns: fighandle, qnull, qemp
48
+ # -----------------------------------------------------------------------
49
+ # '''
50
+ # distr = 'log10'
51
+ # import pylab as pl
52
+ # if type(pvals)==list:
53
+ # pvallist=pvals
54
+ # else:
55
+ # pvallist = [pvals]
56
+ # if type(legend)==list:
57
+ # legendlist=legend
58
+ # else:
59
+ # legendlist = [legend]
60
+ #
61
+ # if h1 is None:
62
+ # h1=pl.figure(figsize=figsize)
63
+ #
64
+ # pl.grid(b=grid, alpha = 0.5)
65
+ #
66
+ # maxval = 0
67
+ #
68
+ # for i in range(len(pvallist)):
69
+ # pval =pvallist[i].flatten()
70
+ # M = pval.shape[0]
71
+ # pnull = (0.5 + sp.arange(M))/M
72
+ # # pnull = np.sort(np.random.uniform(size = tests))
73
+ #
74
+ # pval[pval<minpval]=minpval
75
+ # pval[pval>=1]=1
76
+ #
77
+ # if distr == 'chi2':
78
+ # qnull = st.chi2.isf(pnull, 1)
79
+ # qemp = (st.chi2.isf(sp.sort(pval),1))
80
+ # xl = 'LOD scores'
81
+ # yl = '$\chi^2$ quantiles'
82
+ #
83
+ # if distr == 'log10':
84
+ # qnull = -sp.log10(pnull)
85
+ # qemp = -sp.log10(sp.sort(pval)) #sorts the object, returns nothing
86
+ # xl = '-log10(P) observed'
87
+ # yl = '-log10(P) expected'
88
+ # if not (sp.isreal(qemp)).all(): raise Exception("imaginary qemp found")
89
+ # if qnull.max>maxval:
90
+ # maxval = qnull.max()
91
+ # pl.plot(qnull, qemp, '.', markersize=markersize)
92
+ # #pl.plot([0,qemp.max()], [0,qemp.max()],'r')
93
+ # if addlambda:
94
+ # lambda_gc = estimate_lambda(pval)
95
+ # print("lambda=%1.4f" % lambda_gc)
96
+ # #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)
97
+ # # if there's only one method, just print the lambda
98
+ # if len(pvallist) == 1:
99
+ # legendlist=["$\lambda_{GC}=$%1.4f" % lambda_gc]
100
+ # # otherwise add it at the end of the name
101
+ # else:
102
+ # legendlist[i] = legendlist[i] + " ($\lambda_{GC}=$%1.4f)" % lambda_gc
103
+ #
104
+ # addqqplotinfo(qnull,M,xl,yl,xlim,ylim,alphalevel,legendlist,fixaxes)
105
+ #
106
+ # if title is not None:
107
+ # pl.title(title)
108
+ #
109
+ # if fileout is not None:
110
+ # pl.savefig(fileout)
111
+ #
112
+ # return h1,qnull, qemp,
113
+
114
+
115
+ # def qqplotp(pv,fileout = None, alphalevel = 0.05,legend=None,xlim=None,ylim=None,ycoord=10,plotsize="652x526",title=None,dohist=True, numbins=50, figsize=[5,5], markersize=2):
116
+ # '''
117
+ # Read in p-values from filein and make a qqplot adn histogram.
118
+ # If fileout is provided, saves the qqplot only at present.
119
+ # Searches through p until one is found. '''
120
+ #
121
+ # import pylab as pl
122
+ # pl.ion()
123
+ #
124
+ # fs=8
125
+ # h1=qqplot(pv, fileout, alphalevel,legend,xlim,ylim,addlambda=True, figsize=figsize, markersize=markersize)
126
+ # #lambda_gc=estimate_lambda(pv)
127
+ # #pl.legend(["gc="+ '%1.3f' % lambda_gc],loc=2)
128
+ # pl.title(title,fontsize=fs)
129
+ #
130
+ # wm=pl.get_current_fig_manager()
131
+ # #e.g. "652x526+100+10
132
+ # xcoord=100
133
+ # #wm.window.wm_geometry(plotsize + "+" + str(xcoord) + "+" + str(ycoord))
134
+ #
135
+ # if dohist:
136
+ # h2=pvalhist(pv, numbins=numbins, figsize=figsize)
137
+ # pl.title(title,fontsize=fs)
138
+ # #wm=pl.get_current_fig_manager()
139
+ # width_height=plotsize.split("x")
140
+ # buffer=10
141
+ # xcoord=int(xcoord + float(width_height[0])+buffer)
142
+ # #wm.window.wm_geometry(plotsize + "+" + str(xcoord) + "+" + str(ycoord))
143
+ # else: h2=None
144
+ #
145
+ # return h1,h2
146
+
147
+ # def addqqplotinfo(qnull,M,xl='-log10(P) observed',yl='-log10(P) expected',xlim=None,ylim=None,alphalevel=0.05,legendlist=None,fixaxes=False):
148
+ # distr='log10'
149
+ # pl.plot([0,qnull.max()], [0,qnull.max()],'k')
150
+ # pl.ylabel(xl)
151
+ # pl.xlabel(yl)
152
+ # if xlim is not None:
153
+ # pl.xlim(xlim)
154
+ # if ylim is not None:
155
+ # pl.ylim(ylim)
156
+ # if alphalevel is not None:
157
+ # if distr == 'log10':
158
+ # betaUp, betaDown, theoreticalPvals = _qqplot_bar(M=M,alphalevel=alphalevel,distr=distr)
159
+ # lower = -sp.log10(theoreticalPvals-betaDown)
160
+ # upper = -sp.log10(theoreticalPvals+betaUp)
161
+ # pl.fill_between(-sp.log10(theoreticalPvals),lower,upper,color="grey",alpha=0.5)
162
+ # #pl.plot(-sp.log10(theoreticalPvals),lower,'g-.')
163
+ # #pl.plot(-sp.log10(theoreticalPvals),upper,'g-.')
164
+ # if legendlist is not None:
165
+ # leg = pl.legend(legendlist, loc=4, numpoints=1)
166
+ # # set the markersize for the legend
167
+ # for lo in leg.legendHandles:
168
+ # lo.set_markersize(10)
169
+ #
170
+ # if fixaxes:
171
+ # fix_axes()
172
+
173
+ def _qqplot_bar(M=1000000, alphalevel = 0.05,distr = 'log10'):
174
+ '''
175
+ calculate error bars for a QQ-plot
176
+ --------------------------------------------------------------------
177
+ Input:
178
+ ------------- ----------------------------------------------------
179
+ M number of points to compute error bars
180
+ alphalevel significance level for the error bars (default 0.05)
181
+ distr space in which the error bars are implemented
182
+ Note only log10 is implemented (default 'log10')
183
+ --------------------------------------------------------------------
184
+ Returns:
185
+ ------------- ----------------------------------------------------
186
+ betaUp upper error bars
187
+ betaDown lower error bars
188
+ theoreticalPvals theoretical P-values under uniform
189
+ --------------------------------------------------------------------
190
+ '''
191
+
192
+
193
+ #assumes 'log10'
194
+
195
+ mRange=10**(sp.arange(sp.log10(0.5),sp.log10(M-0.5)+0.1,0.1));#should be exp or 10**?
196
+ numPts=len(mRange);
197
+ betaalphaLevel=sp.zeros(numPts);#down in the plot
198
+ betaOneMinusalphaLevel=sp.zeros(numPts);#up in the plot
199
+ betaInvHalf=sp.zeros(numPts);
200
+ for n in range(numPts):
201
+ m=mRange[n]; #numplessThanThresh=m;
202
+ betaInvHalf[n]=st.beta.ppf(0.5,m,M-m);
203
+ betaalphaLevel[n]=st.beta.ppf(alphalevel,m,M-m);
204
+ betaOneMinusalphaLevel[n]=st.beta.ppf(1-alphalevel,m,M-m);
205
+ pass
206
+ betaDown=betaInvHalf-betaalphaLevel;
207
+ betaUp=betaOneMinusalphaLevel-betaInvHalf;
208
+
209
+ theoreticalPvals=mRange/M;
210
+ return betaUp, betaDown, theoreticalPvals
211
+
212
+
213
+
214
+ # def fix_axes(buffer=0.1):
215
+ # '''
216
+ # Makes x and y max the same, and the lower limits 0.
217
+ # '''
218
+ # maxlim=max(pl.xlim()[1],pl.ylim()[1])
219
+ # pl.xlim([0-buffer,maxlim+buffer])
220
+ # pl.ylim([0-buffer,maxlim+buffer])
221
+
222
def estimate_lambda(pv):
    '''
    estimate the lambda for a given array of P-values
    ------------------------------------------------------------------
    pv          numpy array containing the P-values
    ------------------------------------------------------------------
    L           lambda value
    ------------------------------------------------------------------
    '''
    # Map each P-value to a chi-square (1 d.o.f.) statistic via the inverse
    # survival function and take the median. np.median replaces sp.median,
    # a deprecated NumPy alias in the scipy namespace.
    LOD2 = np.median(st.chi2.isf(pv, 1))
    # 0.456 is the constant the median statistic is normalised by.
    L = (LOD2 / 0.456)
    return L
234
+
235
+
236
+ # def pvalhist(pv,numbins=50,linewidth=3.0,linespec='--r', figsize=[5,5]):
237
+ # '''
238
+ # Plots normalized histogram, plus theoretical null-only line.
239
+ # '''
240
+ # h2=pl.figure(figsize=figsize)
241
+ # [nn,bins,patches]=pl.hist(pv,numbins,normed=True)
242
+ # pl.plot([0, 1],[1,1],linespec,linewidth=linewidth)
243
+
244
+
245
+
246
def get_pval_from_predictions(m0_predictions, m1_predictions, ground_truth, twotailed=False, method='steiger'):
    '''
    Compare two sets of model predictions against the same ground truth.

    Spearman correlations are computed for model 0 vs. truth, model 1 vs.
    truth and model 0 vs. model 1, and passed to corrstats.dependent_corr
    together with the sample count, `twotailed` and `method`.

    If twotailed==False, then need to check that the one of corr0 and corr1
    that is higher is the correct one.

    Returns: t2, pv, corr0, corr1, corr01
    '''
    from . import corrstats

    n_samples = len(m0_predictions)
    n_other = len(m1_predictions)
    n_truth = len(ground_truth)
    # All three inputs must describe the same individuals.
    assert(n_samples == n_other)
    assert(n_samples == n_truth)

    spearman = scipy.stats.spearmanr
    corr0 = spearman(m0_predictions, ground_truth)[0]
    corr1 = spearman(m1_predictions, ground_truth)[0]
    corr01 = spearman(m0_predictions, m1_predictions)[0]

    t2, pv = corrstats.dependent_corr(corr0, corr1, corr01, n_samples, twotailed=twotailed, method=method)
    return t2, pv, corr0, corr1, corr01
261
+
262
def get_thirty_one_mer_data():
    '''
    Load up our processed data file for all of V1 and V2, make a 31mer so that
    we can use the SSC trained model to compare to
    Assumes we call this from the analysis subdirectory

    Reads ../data/FC_RES_5304.csv, adds a "31mer" column built from the
    "30mer", "Target" and "Strand" columns via convert_to_thirty_one, and
    writes the result to ../data/FC_RES_5304_w_31mer.csv. Returns nothing.
    '''
    # Paths are assembled with os.path.join so they resolve on POSIX as well
    # as Windows (the previous literals used backslash separators).
    data_dir = os.path.join("..", "data")
    myfile = os.path.join(data_dir, "FC_RES_5304.csv")
    newfile = os.path.join(data_dir, "FC_RES_5304_w_31mer.csv")

    data = pd.read_csv(myfile)
    data["31mer"] = [
        convert_to_thirty_one(thirty_mer, target, strand)
        for thirty_mer, target, strand in zip(data["30mer"], data["Target"], data["Strand"])
    ]
    data.to_csv(newfile)
276
+
277
+
278
def guide_positional_features(guide_seq, gene, strand):
    """
    Given a guide sequence, a gene name, and strand (e.g. "sense"), return the (absolute) nucleotide cut position, and the percent amino acid.
    From John's email:
    the cut site is always 3nts upstream of the NGG PAM:
    5' - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 <cut> 18 19 20 N G G - 3'
    To calculate percent protein, we determined what amino acid number was being cut and just divided by the total number of amino acids. In the case where the cutsite was between two amino acid codons, I believe we rounded down

    NOTE: the computation itself is not implemented. When the guide cannot
    be located in the gene an empty string is returned (the printed message
    says "None"); otherwise NotImplementedError is raised.
    """

    guide_seq = Seq.Seq(guide_seq)
    # The search runs on the reverse complement of the stored gene sequence;
    # for a "sense" guide the guide is reverse-complemented as well.
    gene_seq = Seq.Seq(get_gene_sequence(gene)).reverse_complement()
    if strand == 'sense':
        guide_seq = guide_seq.reverse_complement()
    ind = gene_seq.find(guide_seq)
    if ind == -1:
        print("returning None, could not find guide %s in gene %s" % (guide_seq, gene))
        return ""
    assert gene_seq[ind:(ind + len(guide_seq))] == guide_seq, "match not right"
    # A leftover interactive debugger breakpoint used to sit here; it has been
    # removed so that callers reliably get the exception below.
    raise NotImplementedError("incomplete implentation for now")
300
+
301
+
302
def convert_to_thirty_one(guide_seq, gene, strand):
    '''
    Given a guide sequence, a gene name, and strand (e.g. "sense"), return a 31mer string which is our 30mer,
    plus one more at the end.

    The guide is searched for in the reverse complement of the gene sequence
    (after being reverse-complemented itself when strand is "sense"). On a
    hit, the matched span plus the one position before it is returned as a
    str, flipped back for "sense" guides.

    NOTE(review): when the guide is not found the whole gene sequence + 'A'
    is returned (a Seq object, not a str), although the printed message
    suggests the guide + 'A' was intended -- confirm.
    NOTE(review): a match at index 0 slices from -1 and so yields an empty
    result -- confirm whether that case can occur.
    '''
    is_sense = (strand == 'sense')

    guide_seq = Seq.Seq(guide_seq)
    gene_seq = Seq.Seq(get_gene_sequence(gene)).reverse_complement()
    if is_sense:
        guide_seq = guide_seq.reverse_complement()

    ind = gene_seq.find(guide_seq)
    if ind == -1:
        print("returning sequence+'A', could not find guide %s in gene %s" % (guide_seq, gene))
        return gene_seq + 'A'

    stop = ind + len(guide_seq)
    assert gene_seq[ind:stop] == guide_seq, "match not right"

    # Extending to stop + 1 looks correct but is wrong, due to the strand
    # frame-of-reference; extending one position to the left is what tacks
    # the extra nucleotide onto the end.
    new_mer = gene_seq[(ind - 1):stop]
    if is_sense:
        new_mer = new_mer.reverse_complement()
    return str(new_mer)
321
+
322
def concatenate_feature_sets(feature_sets, keys=None):
    '''
    Given a dictionary of sets of features, each in a Pandas.DataFrame,
    concatenate them together to form one big np.array, and get the dimension
    of each set

    Args:
        feature_sets: dict mapping a feature-set name to a DataFrame; all
            DataFrames must have the same number of rows.
        keys: optional list of names selecting (and ordering) the sets to
            concatenate; defaults to every key of feature_sets.

    Returns: inputs, dim, dimsum, feature_names
        inputs: array with the selected sets stacked column-wise
        dim: dict mapping each selected name to its number of columns
        dimsum: total number of columns added
        feature_names: column names, in the order they appear in inputs
    '''
    assert feature_sets != {}, "no feature sets present"
    if keys is None:
        keys = list(feature_sets.keys())

    # Every feature set (not only the selected ones) must have the same
    # number of rows as the first selected set.
    n_rows = feature_sets[keys[0]].shape[0]
    for name in feature_sets:
        assert n_rows == feature_sets[name].shape[0], "not same # individuals for features %s and %s" % (keys[0], name)

    # Seed with an (n_rows, 0) float block, then stack everything once instead
    # of re-allocating the growing matrix for each feature set.
    blocks = [np.zeros((n_rows, 0))]
    feature_names = []
    dim = {}
    dimsum = 0
    for name in keys:
        values = feature_sets[name].values
        dim[name] = values.shape[1]
        dimsum += dim[name]
        blocks.append(values)
        feature_names.extend(feature_sets[name].columns.tolist())
    inputs = np.hstack(blocks)

    return inputs, dim, dimsum, feature_names
357
+
358
def extract_individual_level_data(one_result):
    '''
    Extract predictions and truth for each fold
    Returns: all_true_ranks, all_pred (1-D arrays, folds concatenated in the
    order given by fold_labels)

    assumes that one_result is the value for a results dictionary for one key, i.e. one entry in a dictionary loaded up from saved results with pickle
    e.g. all_results, all_learn_options = pickle.load(some_results_file)
    then call extract_individual_level_data(one_result = all_results['firstkey'])
    then, one_result contains: metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
    '''
    # The 7-way unpack doubles as a check on the shape of one_result.
    metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names = one_result

    # Gather the per-fold pieces and concatenate once at the end rather than
    # re-copying the accumulated arrays on every fold. The empty seed keeps
    # the result well-defined when there are no folds.
    rank_chunks = [np.empty(0)]
    pred_chunks = [np.empty(0)]
    for f in fold_labels:
        rank_chunks.append(gene_pred[0][0][f]['ranks'])  # similar for thrs
        pred_chunks.append(gene_pred[0][1][f])

    all_true_ranks = np.concatenate(rank_chunks)
    all_pred = np.concatenate(pred_chunks)
    return all_true_ranks, all_pred
377
+
378
def spearmanr_nonan(x, y):
    '''
    Same as scipy.stats.spearmanr, but when the p-value comes back as nan
    and either input holds a single distinct value, (0.0, 0.0) is returned
    instead. A nan p-value in any other situation raises an Exception.
    (Output: rho, pval)
    '''
    rho, pval = st.spearmanr(x, y)
    if np.isnan(pval):
        constant_input = len(np.unique(x)) == 1 or len(np.unique(y)) == 1
        if not constant_input:
            raise Exception("found nan spearman")
        print("WARNING: spearmanr is nan due to unique values, setting to 0")
        rho, pval = 0.0, 0.0
    assert not np.isnan(rho)
    return rho, pval
393
+
394
+
395
+
396
def impute_gene_position(gene_position):
    '''
    Some amino acid cut position and percent peptide are blank because of stop codons, but
    we still want a number for these.

    Missing 'Percent Peptide' values are set to 101 as a proxy; when an
    'Amino Acid Cut position' column is present its missing values are
    replaced by that column's mean. The frame is modified in place and
    also returned.
    '''
    fill_values = {'Percent Peptide': 101.00}

    cut_column = 'Amino Acid Cut position'
    if cut_column in gene_position.columns:
        fill_values[cut_column] = gene_position[cut_column].mean()

    for column, value in fill_values.items():
        gene_position[column] = gene_position[column].fillna(value)

    return gene_position
408
+
409
+
410
def datestamp(appendrandom=False):
    '''
    Return the current local time as "YYYY-MM-DD_HH_MM_SS".

    With appendrandom=True an underscore and the digits of a random float
    (its string form minus the leading "0.") are appended.
    '''
    import datetime
    stamp = datetime.datetime.now().isoformat(sep="_", timespec="seconds").replace(":", "_")
    if not appendrandom:
        return stamp
    import random
    return stamp + "_" + str(random.random())[2:]
418
+
419
+
420
def get_gene_sequence(gene_name, sequence_dir='../../gene_sequences'):
    '''
    Read the nucleotide sequence stored for a gene and return it as one str
    with all line breaks removed.

    Args:
        gene_name: gene symbol; the file read is
            "<sequence_dir>/<gene_name>_sequence.txt".
        sequence_dir: directory holding the sequence files. The default is
            the relative location this module has always used, so it is
            resolved against the current working directory.

    Raises:
        Exception: if the sequence file cannot be opened or read.
    '''
    gene_file = '%s/%s_sequence.txt' % (sequence_dir, gene_name)
    try:
        # Text mode: reading bytes ('rb') and then calling replace() with str
        # arguments raises TypeError on Python 3, which the previous bare
        # except reported as a missing file. Universal newlines turn "\r\n"
        # into "\n" on read.
        with open(gene_file, 'r', encoding='utf-8') as f:
            seq = f.read()
    except OSError as err:
        raise Exception("could not find gene sequence file %s, please see examples and generate one for your gene as needed, with this filename" % gene_file) from err

    return seq.replace('\n', '')
432
+
433
+ # gene_positions = {'CCDC101': [28553928,28591790]}
434
+ # search = Entrez.esearch(db="gene", term='%s[Gene Name] AND Homo Sapiens[Organism]' % (gene_name))
435
+ # records = Entrez.read(search)
436
+
437
+ # if len(records['IdList']) > 1:
438
+ # print "warning, multiple hits found for entrez gene search %s" % gene_name
439
+
440
+ # elink = Entrez.read(Entrez.elink(dbfrom="gene", db='nucleotide', id=records['IdList'][0]))
441
+ # nucl_id = elink[0]['LinkSetDb'][3]
442
+
443
+ # cut = False
444
+ # if nucl_id['LinkName'] != 'gene_nuccore_refseqgene':
445
+ # if gene_name in gene_positions.keys():
446
+ # nucl_id = elink[0]['LinkSetDb'][0]['Link'][0]['Id']
447
+ # cut = True
448
+ # else:
449
+ # print "sorry not enough information to return sequence"
450
+ # return None
451
+ # else:
452
+ # nucl_id = nucl_id['Link'][0]['Id']
453
+
454
+ # handle = Entrez.efetch(db="nucleotide", id=nucl_id, rettype="gb", retmode="text")
455
+ # record = SeqIO.read(handle, "genbank")
456
+ # handle.close()
457
+
458
+ # if cut:
459
+ # start, end = gene_positions[gene_name]
460
+ # return str(record.seq)[start:end]
461
+ # else:
462
+ # return str(record.seq)
463
+
464
+
465
def target_genes_stats(genes=('HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101')):
    '''
    Print one summary line per gene: sequence length, GC content, melting
    temperature and molecular weight.

    genes is any iterable of gene names; each sequence is loaded with
    get_gene_sequence.  Nothing is returned.
    '''
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is not None:
            print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')))
470
+
471
+
472
def ranktrafo(data):
    '''
    Rank-based inverse-normal transform of a one-dimensional pandas object.

    The values are ranked, the ranks are divided by (n + 1) so that they lie
    strictly between 0 and 1, and each scaled rank u is mapped through
    sqrt(2) * erfinv(2 * u - 1).

    data must expose a one-dimensional `.values` array (e.g. a Series).
    Returns a flat float ndarray with one entry per input value.
    '''
    from scipy import special, stats

    X = np.asarray(data.values)[:, None]
    # Explicit float buffer: zeros_like(X) would inherit an integer dtype
    # from integer input and silently truncate the transformed values.
    RV = np.zeros(X.shape, dtype=float)
    for i in range(X.shape[1]):
        rank = stats.rankdata(X[:, i]) / (X.shape[0] + 1)
        RV[:, i] = np.sqrt(2) * special.erfinv(2 * rank - 1)

    return RV.flatten()
484
+
485
def get_ranks(y, thresh=0.8, prefix="", flip=False, col_name='score'):
    """
    Turn a one-column score frame into normalised ranks and a thresholded label.

    y should be a DataFrame with one column
    thresh is the threshold at which to call it a knock-down or not
    prefix is prepended (followed by "_") to every output column name;
        None means "no prefix at all"
    col_name = 'score' is only for V2 data (it is not used by this function)
    flip should be FALSE for both V1 and V2 (see README.txt in ./data); when
        true the normalised ranks r are replaced by 1 - r

    Returns (y_rank, y_rank_raw, y_threshold, y_quantized):
        y_rank       ranks of y divided by the largest rank
        y_rank_raw   the same, computed on 2**y (undoes a log2 transform)
        y_threshold  1 where y_rank > thresh, else 0
        y_quantized  a copy of y_threshold under a different column name
    """
    from scipy.stats import mstats

    # A None prefix used to crash on the string concatenations below.
    prefix = "" if prefix is None else prefix + "_"

    def _normalised_rank(frame, label):
        # raw=True hands plain ndarrays to rankdata
        ranks = frame.apply(mstats.rankdata, raw=True)
        ranks /= ranks.max()
        if flip:
            ranks = 1.0 - ranks
        ranks.columns = [prefix + label]
        return ranks

    y_rank = _normalised_rank(y, "rank")

    y_threshold = (y_rank > thresh) * 1
    y_threshold.columns = [prefix + "threshold"]

    # JL: undo the log2 transform (not sure this matters?)
    y_rank_raw = _normalised_rank(2 ** y, "rank raw")

    assert not y_rank.isnull().values.any(), "found NaN ranks"

    # quantiles are not computed; the thresholded labels stand in for them
    y_quantized = y_threshold.copy()
    y_quantized.columns = [prefix + "quantized"]

    return y_rank, y_rank_raw, y_threshold, y_quantized
523
+
524
def get_data(data, y_names, organism="human", target_gene=None):
    '''
    Build the feature frame and the output frame for one gene.

    this is called once for each gene (aggregating across cell types)
    y_names are cell types
    e.g. call: X_CD13, Y_CD13 = get_data(cd13, y_names=['NB4 CD13', 'TF1 CD13'])

    For organism == "human" the target_gene argument is ignored and the gene
    is taken from the second space-separated token of y_names[0].

    Returns (features, outputs):
        features  the '30mer' column plus 'Target gene', 'Organism', 'Strand'
        outputs   per cell type: the activity, its rank, threshold and raw
                  rank; then the aggregates across cell types; plus
                  'Target gene' and 'Organism'
    '''
    thresh = 0.8

    # generate ranks for each cell type before aggregating to match what is in Doench et al
    per_cell_type = []
    for y_name in y_names:
        y = pandas.DataFrame(data[y_name])
        # the quantiles returned here are not used
        y_rank, y_rank_raw, y_threshold, _ = get_ranks(y, thresh=thresh, flip=False, col_name=y_name)
        y_rank.columns = [y_name + " rank"]
        y_rank_raw.columns = [y_name + " rank raw"]
        y_threshold.columns = [y_name + " threshold"]
        per_cell_type.extend([y, y_rank, y_threshold, y_rank_raw])

    # one concat instead of growing an initially empty frame inside the loop
    outputs = pandas.concat(per_cell_type, axis=1) if per_cell_type else pandas.DataFrame()

    # aggregated rank across cell types
    average_activity = pandas.DataFrame(outputs[list(y_names)].mean(1))
    average_activity.columns = ['average activity']

    average_rank_from_avg_activity = get_ranks(average_activity, thresh=thresh, flip=False, col_name='average activity')[0]
    average_rank_from_avg_activity.columns = ['average_rank_from_avg_activity']
    average_threshold_from_avg_activity = (average_rank_from_avg_activity > thresh) * 1
    average_threshold_from_avg_activity.columns = ['average_threshold_from_avg_activity']

    average_rank = pandas.DataFrame(outputs[[y_name + ' rank' for y_name in y_names]].mean(1))
    average_rank.columns = ['average rank']
    # higher ranks are better (when flip=False as it should be)
    average_threshold = (average_rank > thresh) * 1
    average_threshold.columns = ['average threshold']

    # undo the log2 trafo on the reads per million, apply rank trafo right away
    average_rank_raw = pandas.DataFrame(outputs[[y_name + ' rank raw' for y_name in y_names]].mean(1))
    average_rank_raw.columns = ['average rank raw']

    outputs = pandas.concat([outputs, average_rank, average_threshold, average_activity, average_rank_raw, average_rank_from_avg_activity, average_threshold_from_avg_activity], axis=1)

    # sequence-specific featurization is deferred; only the raw 30mer is kept here
    features = pandas.DataFrame(data['30mer'])

    if organism == "human":
        target_gene = y_names[0].split(' ')[1]

    outputs['Target gene'] = target_gene
    outputs['Organism'] = organism

    features['Target gene'] = target_gene
    features['Organism'] = organism
    features['Strand'] = pandas.DataFrame(data['Strand'])

    return features, outputs
583
+
584
+
585
+ # def plot_metrics(metrics, truth_and_predictions, target_genes, run_label, color=None, filename_prefix=None, learn_options=None):
586
+ #
587
+ # if learn_options["metric"] == 'AUC':
588
+ # best = truth_and_predictions[0]#[np.argmax(cv_scores)]
589
+ # plt.figure('ROC per gene')
590
+ # plt.figure('global ROC')
591
+ # plt.figure('AUC ROC per gene')
592
+ #
593
+ # all_truth = np.array([])
594
+ # all_predictions = np.array([])
595
+ # AUCs = []
596
+ # AUCs_labels = []
597
+ # for i, gene in enumerate(target_genes):
598
+ # if len(best[1][gene])==0:
599
+ # continue
600
+ # plt.figure('ROC per gene')
601
+ # plt.subplot(331+i)
602
+ # fpr, tpr, _ = sklearn.metrics.roc_curve(best[0][gene], best[1][gene])
603
+ # np.savetxt('../results/%s_ROC.txt' % gene, np.hstack((fpr[:, None], tpr[:, None])))
604
+ #
605
+ # roc_auc = sklearn.metrics.auc(fpr, tpr)
606
+ # AUCs.append(roc_auc)
607
+ # AUCs_labels.append(gene)
608
+ # plt.plot(fpr, tpr, label=run_label)
609
+ # plt.title(gene)
610
+ # h1 = plt.figure('global ROC')
611
+ # plt.plot(fpr, tpr, color=color, alpha=.2, linewidth=2.)
612
+ #
613
+ # all_truth = np.hstack((all_truth, best[0][gene]))
614
+ # all_predictions = np.hstack((all_predictions, best[1][gene]))
615
+ #
616
+ # plt.legend(loc=0)
617
+ #
618
+ # plt.figure('AUC ROC per gene')
619
+ # ax = plt.subplot(111)
620
+ # rect = ax.bar(list(range(len(AUCs))), AUCs, width=0.8)
621
+ # autolabel(ax,rect)
622
+ #
623
+ # ax.set_ylim((0.5, 1.0))
624
+ # ax.set_ylabel('AUC ROC')
625
+ # ax.set_xticks(np.array(list(range(len(AUCs)))) + 0.8 / 2)
626
+ # ax.set_xticklabels([t for t in AUCs_labels])
627
+ #
628
+ # fpr, tpr, _ = sklearn.metrics.roc_curve(all_truth, all_predictions)
629
+ # roc_auc = sklearn.metrics.auc(fpr, tpr)
630
+ # #print run_label, roc_auc
631
+ # plt.figure('global ROC')
632
+ # plt.plot(fpr, tpr, label=run_label + " AUC=%.2f" % roc_auc, color=color, linewidth=2.)
633
+ # plt.legend(loc=0)
634
+ # plt.xlabel('False Positive Rate')
635
+ # plt.ylabel('True Positive Rate')
636
+ # #np.savetxt('../results/global_ROC.txt', np.hstack((fpr[:, None], tpr[:, None])))
637
+ # #np.savetxt('../results/AUCs.txt', np.hstack((np.array([t for t in target_genes])[:, None], np.array(AUCs)[:, None])), fmt='%s')
638
+ #
639
+ # if filename_prefix != None:
640
+ # plt.figure('global ROC')
641
+ # plt.savefig(filename_prefix+'globalROC.png')
642
+ #
643
+ # plt.figure('ROC per gene')
644
+ # plt.savefig(filename_prefix+'ROC_per_gene.png')
645
+ #
646
+ # plt.figure('AUC ROC per gene')
647
+ # plt.savefig(filename_prefix+'AUCROC_barplot.png')
648
+ # return roc_auc
649
+ # else:
650
+ # plt.figure('NDCG per gene')
651
+ # ax = plt.subplot(111)
652
+ # rect = ax.bar(list(range(len(metrics))), metrics, width=0.8)
653
+ # autolabel(ax,rect)
654
+ # ax.set_ylim((0.0, 1.2))
655
+ # ax.set_ylabel('NDCG')
656
+ # ax.set_xticks(np.array(list(range(len(metrics)))) + 0.8 / 2)
657
+ # ax.set_xticklabels([t for t in target_genes])
658
+ #
659
+ # truth, predictions = truth_and_predictions[0]
660
+ # all_truth = np.array([])
661
+ # all_predictions = np.array([])
662
+ #
663
+ # for i, gene in enumerate(target_genes):
664
+ # if len(predictions[gene])==0:
665
+ # continue
666
+ #
667
+ # all_truth = np.hstack((all_truth, truth[gene]))
668
+ # all_predictions = np.hstack((all_predictions, predictions[gene]))
669
+ #
670
+ # sorted = all_predictions[np.argsort(all_truth).flatten()[::-1]]
671
+ # sortedgt = np.sort(all_truth).flatten()[::-1]
672
+ # NDCG_total = ranking_metrics.ndcg_at_k_custom_n(sorted, learn_options["NDGC_k"], sortedgt)
673
+ #
674
+ # if filename_prefix != None:
675
+ # plt.figure('NDCG per gene')
676
+ # plt.savefig(filename_prefix+'NDCG_barplot.png')
677
+ #
678
+ # return NDCG_total
679
+
680
def autolabel(ax, rects, strfrm='%.2f'):
    '''
    Write the height of every bar just above it in a bar chart.

    ax is the axes to draw on, rects the bars, and strfrm the %-format used
    for the label text.
    See http://matplotlib.org/1.4.2/examples/api/barchart_demo.html
    '''
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2.
        label = strfrm % float(bar_height)
        ax.text(x_center, 1.05 * bar_height, label, ha='center', va='bottom')
689
+
690
+
691
def create_cachedir(dirname='./cache/default'):
    '''
    Make sure the cache directory exists and return its path.

    Missing parent directories are created as well.  An already existing
    path is returned unchanged.
    '''
    if not os.path.exists(dirname):
        # exist_ok covers the case where something else creates the directory
        # between the existence check and this call.
        os.makedirs(dirname, exist_ok=True)
    return dirname
697
+
698
def dcg(relevances, rank=20):
    '''
    Discounted cumulative gain of the first `rank` relevances.

    The relevance at 0-based position i is divided by log2(i + 2) and the
    quotients are summed.  An empty input gives 0.
    '''
    top = np.asarray(relevances)[:rank]
    if not len(top):
        return 0.
    positions = np.arange(2, len(top) + 2)
    return np.sum(top / np.log2(positions))
705
+
706
def ndcgk(relevances, rank=20):
    '''
    Normalised discounted cumulative gain at `rank`.

    The DCG of the relevances in the given order is divided by the DCG of
    the same relevances sorted in descending order.  Returns 0 when that
    best-case DCG is 0.
    '''
    ideal_order = sorted(relevances, reverse=True)
    ideal = dcg(ideal_order, rank)
    if ideal != 0:
        return dcg(relevances, rank) / ideal
    return 0.
711
+
712
def extract_feature_from_model(method, results, split):
    '''
    Return the per-feature weights of the model fitted for one split, as a
    column vector.

    The model is read from results[method][3][split].  For an ElasticNet the
    coefficients are used, for a GradientBoostingRegressor the feature
    importances.

    Raises:
        Exception: for any other model type.
    '''
    model = results[method][3][split]
    # ElasticNet is referenced through its public location; the private
    # sklearn.linear_model.coordinate_descent module no longer exists in
    # current scikit-learn releases.
    if isinstance(model, sklearn.linear_model.ElasticNet):
        return model.coef_[:, None]
    if isinstance(model, sklearn.ensemble.GradientBoostingRegressor):
        return model.feature_importances_[:, None]
    raise Exception("need to add model %s to feature extraction" % model)
721
+
722
def extract_feature_from_model_sum(method, results, split, indexes):
    '''
    Return the summed weight of a group of features for the model fitted
    for one split.

    The model is read from results[method][3][split]; indexes selects the
    features to add up.  For an ElasticNet the coefficients are summed, for
    a GradientBoostingRegressor the feature importances.

    Raises:
        Exception: for any other model type.
    '''
    model = results[method][3][split]
    # ElasticNet is referenced through its public location; the private
    # sklearn.linear_model.coordinate_descent module no longer exists in
    # current scikit-learn releases.
    if isinstance(model, sklearn.linear_model.ElasticNet):
        return np.sum(model.coef_[indexes])
    if isinstance(model, sklearn.ensemble.GradientBoostingRegressor):
        return np.sum(model.feature_importances_[indexes])
    raise Exception("need to add model %s to feature extraction" % model)
731
+
732
def feature_importances(results, fontsize=16, figsize=(14, 8)):
    '''
    Tabulate, per feature, the mean and standard deviation of its importance
    across the splits of a method.

    results is keyed by method name; for each method, element [3] maps
    split -> fitted model and element [6] is the list of feature names.
    fontsize and figsize are not used (the plotting code is disabled).

    Returns a DataFrame with the columns 'Feature name',
    'Mean feature importance' and 'Std. Dev.'.  The function returns from
    inside the loop, i.e. after the first method in results
    (NOTE(review): the nesting of the return was reconstructed - confirm).

    Raises:
        Exception: if a feature name appears more than once.
    '''
    feature_dictionary = {
        'pd_order2': 'position dep. order 2 ',
        'pd_order1': 'position dep. order 1 ',
        'pi_order1': 'position ind. order 1 ',
        'pi_order2': 'position ind. order 2 ',
        '5mer_end_False': 'Tm (5mer end)',
        '5mer_start_False': 'Tm (5mer start)',
        'Amino Acid Cut position': 'amino acid cut position ',
        '8mer_middle_False': 'Tm (8mer middle)',
        'NGGX_pd.Order2': 'NGGN interaction ',
        'Tm global_False': 'Tm (30mer)',
        'Percent Peptide': 'percent peptide ',
    }

    for method in list(results.keys()):
        feature_names = results[method][6]

        # Every name goes into `seen`; repeats are collected separately.
        # (Previously `seen` was never filled, so the check could not fire.)
        seen = set()
        duplicates = set()
        for ft in feature_names:
            if ft in seen:
                duplicates.add(ft)
            seen.add(ft)
        if duplicates:
            raise Exception("feature name appears more than once: %s" % duplicates)

        # Group feature indices by kind.  The grouped importances only feed
        # the (disabled) bar/box plots; the returned table does not depend
        # on them.
        pd_order1, pi_order1, pd_order2, pi_order2, nggx = [], [], [], [], []
        for i, s in enumerate(feature_names):
            if 'False' in s:
                continue
            elif "_" in s:
                nucl, pos = s.split('_')
                if len(nucl) == 1:
                    pd_order1.append(i)
                elif len(nucl) == 2:
                    pd_order2.append(i)
            elif "NGGX_pd.Order2" in s:
                # NOTE(review): unreachable - any name containing
                # "NGGX_pd.Order2" also contains "_" and is taken by the
                # branch above.  Left as is; confirm the intended order.
                nggx.append(i)
            else:
                nucl = s
                if len(nucl) == 1:
                    pi_order1.append(i)
                elif len(nucl) == 2:
                    pi_order2.append(i)

        grouped_feat = {'pd_order2': pd_order2,
                        'pi_order2': pi_order2,
                        'pd_order1': pd_order1,
                        'pi_order1': pi_order1,
                        'NGGX_pd.Order2': nggx}

        grouped_feat_ind = [i for indices in grouped_feat.values() for i in indices]
        remaining_features_ind = set(range(len(feature_names))) - set(grouped_feat_ind)
        for i in remaining_features_ind:
            grouped_feat[feature_names[i]] = [i]

        splits = list(results[method][3].keys())

        feature_importances_grouped = {}
        for k, indexes in grouped_feat.items():
            if len(indexes) == 0:
                continue
            for split in splits:
                split_feat_importance = extract_feature_from_model_sum(method, results, split, indexes)
                feature_importances_grouped.setdefault(k, []).append(split_feat_importance)

        # one column of importances per split
        all_split_importances = np.concatenate(
            [extract_feature_from_model(method, results, split) for split in splits],
            axis=1)

        # Build the table from typed columns directly, so no string round
        # trip (and no removed DataFrame.convert_objects call) is needed.
        df = pandas.DataFrame({
            'Feature name': list(feature_names),
            'Mean feature importance': np.mean(all_split_importances, axis=1),
            'Std. Dev.': np.std(all_split_importances, axis=1),
        }, columns=['Feature name', 'Mean feature importance', 'Std. Dev.'])

        # Replace known raw feature names by their descriptive labels in a
        # single column assignment (chained .iloc assignment is unreliable).
        df['Feature name'] = [feature_dictionary.get(name, name) for name in df['Feature name']]

        return df
862
+
863
def check_learn_options_set(learn_options_set):
    '''
    Return the "testing_non_binary_target_name" shared by all learn options
    in the set.

    learn_options_set is None  -> 'ranks'
    learn_options_set is empty -> None
    Otherwise the name of the first entry is returned, after asserting that
    every later entry uses the same name.
    '''
    if learn_options_set is None:
        return 'ranks'

    shared_name = None
    for options in learn_options_set.values():
        if shared_name is None:
            shared_name = options["testing_non_binary_target_name"]
            continue
        assert shared_name == options["testing_non_binary_target_name"], "need to have same testing_non_binary_target_name across all learn options in a set for metrics to be comparable"
    return shared_name
875
+
876
def get_all_metrics(results, learn_options_set=None, test_metrics=('spearmanr',), add_extras=False, force_by_gene=False):
    """
    Evaluate every method in results, gene by gene.

    'metrics' here are the metrics used to evaluate

    results is keyed by method name; results[method][1][0] is the pair
    (truth, predictions), both keyed by gene.  truth[gene] is indexed with
    the non-binary target name (see check_learn_options_set) and with 'thrs'.
    test_metrics may contain 'spearmanr', 'spearmanr>2.5', 'RMSE',
    'NDCG@5/10/20/50', 'precision@5/10/20' and 'AUC'.
    force_by_gene is not used.

    Returns (all_results, genes) where all_results[method][metric] is a list
    with one value per gene.  With add_extras, additionally returns
    fpr_all, tpr_all (ROC over all genes) and fpr_gene, tpr_gene (per-gene
    ROC, filled only when 'AUC' is requested) of the last method.

    Raises:
        Exception: if a method does not cover the same genes as the first one.
    """
    all_results = {method: {} for method in results}
    genes = list(results[list(results.keys())[0]][1][0][0].keys())

    for method in all_results:
        for metric in test_metrics:
            all_results[method][metric] = []

    non_binary_target_name = check_learn_options_set(learn_options_set)

    for method in results:
        truth, predictions = results[method][1][0]

        # The original check here was a no-op; a differing gene set would
        # make the per-gene lists of the methods incomparable.
        if set(truth.keys()) != set(genes):
            raise Exception("genes have changed, need to modify code")

        scores = all_results[method]
        fpr_gene = {}
        tpr_gene = {}
        y_truth_thresh_all = np.array([])
        y_pred_all = np.array([])

        for gene in genes:
            y_truth, y_pred = truth[gene], predictions[gene]

            y_truth_thresh_all = np.append(y_truth_thresh_all, y_truth['thrs'])
            y_pred_all = np.append(y_pred_all, y_pred)

            if 'spearmanr' in test_metrics:
                scores['spearmanr'].append(spearmanr_nonan(y_truth[non_binary_target_name], y_pred)[0])

            if 'spearmanr>2.5' in test_metrics:
                # NOTE: despite the metric name, this stores the RMSE over
                # the guides whose target value exceeds 1.0.
                selected = y_truth[non_binary_target_name] > 1.0
                scores['spearmanr>2.5'].append(
                    np.sqrt(np.mean((y_truth[non_binary_target_name][selected] - y_pred[selected])**2)))

            if 'RMSE' in test_metrics:
                scores['RMSE'].append(np.sqrt(np.mean((y_truth[non_binary_target_name] - y_pred)**2)))

            for k in (5, 10, 20, 50):
                name = 'NDCG@%d' % k
                if name in test_metrics:
                    scores[name].append(ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, k))

            for k in (5, 10, 20):
                name = 'precision@%d' % k
                if name in test_metrics:
                    target = y_truth[non_binary_target_name]
                    # 1 for every entry at least as large as the k-th largest
                    y_top_truth = (target >= np.sort(target)[::-1][:k][-1]) * 1
                    y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:k][-1]) * 1
                    # NOTE(review): precision_score takes (y_true, y_pred);
                    # the argument order is kept as it was - confirm intent.
                    scores[name].append(sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'AUC' in test_metrics:
                fpr_gene[gene], tpr_gene[gene], _ = sklearn.metrics.roc_curve(y_truth['thrs'], y_pred)
                scores['AUC'].append(sklearn.metrics.auc(fpr_gene[gene], tpr_gene[gene]))

    if add_extras:
        fpr_all, tpr_all, _ = sklearn.metrics.roc_curve(y_truth_thresh_all, y_pred_all)
        return all_results, genes, fpr_all, tpr_all, fpr_gene, tpr_gene
    return all_results, genes
965
+
966
def plot_all_metrics(metrics, gene_names, all_learn_options, save, plots=None, bottom=0.19):
    '''
    Placeholder: plotting is disabled, so the arguments are accepted and
    nothing is done.
    '''
    return None
968
+ # num_methods = len(list(metrics.keys()))
969
+ # metrics_names = list(metrics[list(metrics.keys())[0]].keys())
970
+ # num_genes = len(gene_names)
971
+ # width = 0.9/num_methods
972
+ # ind = np.arange(num_genes)
973
+ #
974
+ # if save==True:
975
+ # first_key = list(all_learn_options.keys())[0]
976
+ # #basefile = r"..\results\V%s_trmetric%s_%s" % (all_learn_options[first_key]["V"], all_learn_options[first_key]["training_metric"], datestamp())
977
+ # basefile = r"..\results\%s" % (first_key)
978
+ #
979
+ # d = os.path.dirname(basefile)
980
+ # if not os.path.exists(d):
981
+ # os.makedirs(d)
982
+ # with open(basefile + ".plot.pickle", "wb") as f:
983
+ # pickle.dump([metrics, all_learn_options, gene_names], f)
984
+ #
985
+ # for metric in metrics_names:
986
+ # if 'global' not in metric:
987
+ # plt.figure(metric, figsize=(20, 8))
988
+ # elif plots == None or 'gene level' in plots:
989
+ # plt.figure(metric, figsize=(12, 12))
990
+ #
991
+ # boxplot_labels = []
992
+ # boxplot_arrays = {}
993
+ # boxplot_median = {}
994
+ #
995
+ # for i, method in enumerate(metrics.keys()):
996
+ # boxplot_labels.append(method)
997
+ # for metric in list(metrics[method].keys()):
998
+ #
999
+ # if 'global' in metric:
1000
+ # plt.figure(metric)
1001
+ # plt.bar([i], metrics[method][metric], 0.9, color=plt.cm.Paired(1.*i/len(list(metrics.keys()))), label=method)
1002
+ # else:
1003
+ # if plots == None or 'gene level' in plots:
1004
+ # plt.figure(metric)
1005
+ # plt.bar(ind+(i*width), metrics[method][metric], width, color=plt.cm.Paired(1.*i/len(list(metrics.keys()))), label=method)
1006
+ #
1007
+ # median_metric = np.median(metrics[method][metric])
1008
+ # print(method, metric, median_metric)
1009
+ # assert not np.isnan(median_metric), "found nan for %s, %s" % (method, metric)
1010
+ # if metric not in list(boxplot_arrays.keys()):
1011
+ # boxplot_arrays[metric] = np.array(metrics[method][metric])[:, None]
1012
+ # boxplot_median[metric] = [np.median(np.array(metrics[method][metric]))]
1013
+ # else:
1014
+ # boxplot_arrays[metric] = np.concatenate((boxplot_arrays[metric], np.array(metrics[method][metric])[:, None]), axis=1)
1015
+ # boxplot_median[metric].append(np.median(np.array(metrics[method][metric])))
1016
+ #
1017
+ #
1018
+ # for metric in metrics_names:
1019
+ # if plots == None or 'gene level' in plots:
1020
+ # ax = plt.figure(metric)
1021
+ # leg = plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
1022
+ # # leg.draggable(state=True, use_blit=True)
1023
+ # plt.ylabel(metric)
1024
+ #
1025
+ # if 'global' in metric:
1026
+ # plt.xticks(list(range(len(list(metrics.keys())))), list(metrics.keys()), rotation=70)
1027
+ # plt.grid(True, which='both')
1028
+ # plt.subplots_adjust(left = 0.05, right = 0.8)
1029
+ # else:
1030
+ # plt.xticks(ind+width, gene_names)
1031
+ # plt.grid(True, which='both')
1032
+ # plt.subplots_adjust(left = 0.05, right = 0.8)
1033
+ # if save == True:
1034
+ # plt.xticks(ind+0.5, gene_names)
1035
+ # if metric=='AUC':
1036
+ # plt.ylim([0.5, 1.0])
1037
+ # plt.savefig(basefile + "_" + metric + "_bar" + ".png")
1038
+ #
1039
+ # if (plots == None or "boxplots" in plots) and 'global' not in metric:
1040
+ # plt.figure('Boxplot %s' % metric)
1041
+ #
1042
+ # sorted_boxplot = np.argsort(boxplot_median[metric])[::-1]
1043
+ #
1044
+ # plt.boxplot(boxplot_arrays[metric][:, sorted_boxplot])
1045
+ # plt.ylabel(metric)
1046
+ # plt.xticks(list(range(1, num_methods+1)), np.array(boxplot_labels)[sorted_boxplot], rotation=70)
1047
+ # plt.subplots_adjust(top = 0.97, bottom = bottom)
1048
+ #
1049
+ # if metric == 'RMSE':
1050
+ # plt.ylim((1.0, 2.0))
1051
+ #
1052
+ # if save == True:
1053
+ # plt.savefig(basefile + "_" + metric + ".png")
1054
+
1055
def load_results(directory, all_results, all_learn_options, model_filter=None, append_to_key=None):
    '''
    Merge the pickled results found in a directory into the given dicts.

    Every '*.pickle' file in directory is read, except files whose path
    contains 'learn_options'.  Each file holds (results, learn_options) or
    (results, learn_options, gene_names); both dicts are keyed by model name.

    Only load up files which contain one of the strings in model_filter in their names
    model_filter should be a list, or a string
    append_to_key, if given, is appended (after "_") to every key added.

    Returns the updated (all_results, all_learn_options); both are also
    modified in place.

    Raises:
        Exception: if the directory has no pickle files, or none of them
            contributed a key.
    '''
    # os.path.join keeps the pattern valid on every platform (a hard-coded
    # backslash only works on Windows).
    filelist = glob.glob(os.path.join(directory, '*.pickle'))
    if not filelist:
        raise Exception("found no pickle files in %s" % directory)
    print("found %d files in %s" % (len(filelist), directory))

    num_added = 0
    for results_file in filelist:
        if 'learn_options' in results_file:
            continue

        if model_filter is not None:
            if isinstance(model_filter, (list, tuple)):
                if not any(m in results_file for m in model_filter):
                    print("%s not in model_filter" % (results_file))
                    continue
            elif model_filter not in results_file:
                continue

        # NOTE: unpickling can execute arbitrary code - only point this at
        # result files from a trusted source.
        with open(results_file, 'rb') as f:
            loaded = pickle.load(f)
        try:
            results, learn_options = loaded
        except ValueError:
            # files accidentally saved from the plotting routine also carry
            # the gene names; this should not generally be needed
            results, learn_options, _ = loaded

        for k in list(results.keys()):
            k_new = k if append_to_key is None else k + "_" + append_to_key
            assert k_new not in all_results, "found %s already" % k
            print("adding key %s (from file %s)" % (k_new, os.path.split(results_file)[-1]))
            all_results[k_new] = results[k]
            all_learn_options[k_new] = learn_options[k]
            num_added += 1

    if num_added == 0:
        raise Exception("found no files to add from dir=%s" % directory)

    return all_results, all_learn_options
1107
+
1108
def plot_cluster_results(metrics=['spearmanr', 'NDCG@5'], plots=['boxplots'], directory=r'\\fusi1\crispr2\analysis\cluster\results', results=None, learn_options=None, filter=None):
    '''
    Gather results, compute the requested metrics and hand them to
    plot_all_metrics.

    When results is given, it and learn_options are used as they are.
    Otherwise the results are loaded with load_results from directory, which
    may be one path or a list of paths; filter is passed on as the
    model filter.
    '''
    all_results = {}
    all_learn_options = {}

    if results is not None:
        for key in results:
            assert key not in all_results
            all_results[key] = results[key]
            all_learn_options[key] = learn_options[key]
    else:
        experiment_dirs = directory if type(directory) is list else [directory]
        for exp_dir in experiment_dirs:
            all_results, all_learn_options = load_results(exp_dir, all_results, all_learn_options, filter)

    all_metrics, gene_names = get_all_metrics(all_results, test_metrics=metrics)
    plot_all_metrics(all_metrics, gene_names, all_learn_options, plots=plots, save=False)
1128
+
1129
+
1130
def ensemble_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_izf_ob', ensemble_type='median', models_to_ensemble=('all',)):
    '''
    Combine the per-gene predictions of several stored models into one
    ensemble prediction.

    Every '*.pickle' file in directory (except those whose path contains
    'learn_options') must hold a (results, learn_options) pair keyed by model
    name.  A model takes part when its name contains one of the strings in
    models_to_ensemble.  ensemble_type is 'majority', 'median' or 'stacking'.

    Returns (all_results, all_learn_options) with the ensemble added under
    the key ensemble_type.

    Raises:
        ValueError: for an unknown ensemble_type, or when no model matches
            models_to_ensemble.
    '''
    known_types = ('majority', 'median', 'stacking')
    if ensemble_type not in known_types:
        # previously this surfaced later as an unbound-variable error
        raise ValueError("unknown ensemble_type %r, expected one of %s" % (ensemble_type, ", ".join(known_types)))

    all_results = {}
    all_learn_options = {}

    # os.path.join keeps the pattern valid on every platform
    for results_file in glob.glob(os.path.join(directory, '*.pickle')):
        if 'learn_options' in results_file:
            continue

        # NOTE: unpickling can execute arbitrary code - only point this at
        # result files from a trusted source.
        with open(results_file, 'rb') as f:
            results, learn_options = pickle.load(f)

        for k in list(results.keys()):
            assert k not in all_results
            all_results[k] = results[k]
            all_learn_options[k] = learn_options[k]

    genes = list(all_results[list(all_results.keys())[0]][1][0][0].keys())
    models = list(all_results.keys())

    ens_predictions = {}
    ens_truths = {}
    for gene in genes:
        test_predictions = None
        cv_predictions = None
        cv_truth = None
        prev_model_truth = None

        for model in models:
            if not any(m in model for m in models_to_ensemble):
                continue

            truth, predictions = all_results[model][1][0]

            # `is None`: comparing an ndarray with == yields an array, whose
            # truth value is ambiguous from the second model on.
            gene_predictions = predictions[gene][:, None]
            if test_predictions is None:
                test_predictions = gene_predictions
            else:
                test_predictions = np.append(test_predictions, gene_predictions, axis=1)

            # this is just to check that all the models are using the same ordering of
            # the ground truth and hence of the samples, as this might mess up the ensemble.
            if prev_model_truth is not None:
                assert np.all(truth[gene]['ranks'] == prev_model_truth)
            else:
                prev_model_truth = truth[gene]['ranks']

            # take all the other genes and stack the predictions under a given model.
            cv_predictions_gene_j = np.array([])
            cv_truth_gene_j = np.array([])
            for other_gene in genes:
                if gene == other_gene:
                    continue
                cv_predictions_gene_j = np.append(cv_predictions_gene_j, predictions[other_gene])
                cv_truth_gene_j = np.append(cv_truth_gene_j, truth[other_gene]['ranks'])

            if cv_truth is None:
                cv_truth = cv_truth_gene_j.copy()[:, None]

            if cv_predictions is None:
                cv_predictions = cv_predictions_gene_j[:, None]
            else:
                cv_predictions = np.append(cv_predictions, cv_predictions_gene_j[:, None], axis=1)

        if test_predictions is None:
            raise ValueError("no model name matches models_to_ensemble=%r" % (models_to_ensemble,))

        if ensemble_type == 'majority':
            y_pred = ensembles.pairwise_majority_voting(test_predictions)
        elif ensemble_type == 'median':
            y_pred = ensembles.median(test_predictions)
        else:
            y_pred = ensembles.linear_stacking(cv_truth, cv_predictions, test_predictions)

        ens_predictions[gene] = y_pred
        # the ground truth of the last participating model; the ordering
        # check above asserts that the 'ranks' agree across models
        ens_truths[gene] = truth[gene]

    all_results[ensemble_type] = [None, [[ens_truths, ens_predictions]], None, None]
    all_learn_options[ensemble_type] = None

    return all_results, all_learn_options
1213
+
1214
+ # def plot_old_vs_new_feat(results, models, fontsize=20, filename=None, print_output=False):
1215
+ #
1216
+ # model_names = []
1217
+ # for model in models:
1218
+ # if 'doench' in model:
1219
+ # model_names.append('SVM + LogReg')
1220
+ # elif 'AB_' in model:
1221
+ # model_names.append('AdaBoost DT')
1222
+ # else:
1223
+ # model_names.append(model)
1224
+ #
1225
+ # base_spearman_means = []
1226
+ # base_AUC_means = []
1227
+ # feat_spearman_means = []
1228
+ # feat_AUC_means = []
1229
+ # base_spearman_std = []
1230
+ # feat_spearman_std = []
1231
+ # base_AUC_se = []
1232
+ # feat_AUC_se = []
1233
+ #
1234
+ # for model in models:
1235
+ # metrics = get_all_metrics({model: results[model]}, test_metrics=['spearmanr', 'AUC'])[0][model]
1236
+ # metrics_feat = get_all_metrics({model + '_feat': results[model + "_feat"]}, test_metrics=['spearmanr', 'AUC'])[0][model + '_feat']
1237
+ #
1238
+ # base_spearman_means.append(np.mean(metrics['spearmanr']))
1239
+ # base_spearman_std.append(np.std(metrics['spearmanr']))
1240
+ # base_AUC_means.append(np.mean(metrics['AUC']))
1241
+ # base_AUC_se.append(np.std(metrics['AUC']))
1242
+ #
1243
+ # feat_spearman_means.append(np.mean(metrics_feat['spearmanr']))
1244
+ # feat_spearman_std.append(np.std(metrics_feat['spearmanr']))
1245
+ # feat_AUC_means.append(np.mean(metrics_feat['AUC']))
1246
+ # feat_AUC_se.append(np.std(metrics_feat['AUC']))
1247
+ #
1248
+ #
1249
+ # print("old features")
1250
+ # print("mean: " + str(base_spearman_means))
1251
+ # print("std: " + str(base_spearman_std))
1252
+ #
1253
+ # print("old + new features")
1254
+ # print("mean: " + str(feat_spearman_means))
1255
+ # print("std: " + str(feat_spearman_std))
1256
+ #
1257
+ # plt.figure()
1258
+ # ind = np.arange(len(models))
1259
+ # width = 0.4
1260
+ # plt.bar(ind, base_spearman_means, width, color='#D14B5D', yerr=base_spearman_std, ecolor='k', edgecolor='none', label='Old features')
1261
+ # plt.bar(ind+width, feat_spearman_means, width, color='#852230', yerr=feat_spearman_std, ecolor='k', edgecolor='none', label='Old + new features')
1262
+ # ax = plt.gca()
1263
+ # ax.set_ylabel('Spearman r', fontsize=fontsize)
1264
+ # ax.set_xticks(ind+width)
1265
+ # ax.set_xticklabels(model_names, fontsize=fontsize)
1266
+ # plt.legend(loc=0, fontsize=fontsize)
1267
+ # plt.yticks(fontsize=fontsize)
1268
+ # plt.ylim((0.0, 0.7))
1269
+ # remove_top_right_on_plot()
1270
+ # if filename is not None:
1271
+ # plt.savefig(filename + '_spearman.pdf')
1272
+ #
1273
+ # plt.figure()
1274
+ # ind = np.arange(len(models))
1275
+ # width = 0.4
1276
+ # plt.bar(ind, base_AUC_means, width, color='#D14B5D', yerr=base_AUC_se, ecolor='k', edgecolor='none', label='Old features')
1277
+ # plt.bar(ind+width, feat_AUC_means, width, color='#852230', yerr=feat_AUC_se, ecolor='k', edgecolor='none', label='Old + new features')
1278
+ # ax = plt.gca()
1279
+ # ax.set_ylabel('AUC', fontsize=fontsize)
1280
+ # ax.set_xticks(ind+width)
1281
+ # ax.set_xticklabels(model_names, fontsize=fontsize)
1282
+ # plt.legend(loc=0)
1283
+ # plt.ylim((0.5, 0.85))
1284
+ # plt.legend(loc=0, fontsize=fontsize)
1285
+ # plt.yticks(fontsize=fontsize)
1286
+ # remove_top_right_on_plot()
1287
+ # if filename is not None:
1288
+ # plt.savefig(filename + '_AUC.pdf')
1289
+ #
1290
+ # # plt.subplots_adjust(top = 0.97, bottom = 0.4)
1291
+
1292
+
1293
+ # def remove_top_right_on_plot(ax=None):
1294
+ # if ax==None:
1295
+ # # ax = plt.gca()
1296
+ # ax.xaxis.set_ticks_position('bottom')
1297
+ # ax.yaxis.set_ticks_position('left')
1298
+ # ax.spines['right'].set_visible(False)
1299
+ # ax.spines['top'].set_visible(False)
1300
+
1301
+
1302
if __name__ == '__main__':
    # Script entry point: regenerate the pickled feature (X) and target (Y)
    # tables from the V1 Excel workbook.

    # NOTE(review): a debugger breakpoint is left active here, so running the
    # module as a script stops in ipdb before anything below executes.
    # Confirm whether this is intentional before shipping.
    get_thirty_one_mer_data()
    import ipdb
    ipdb.set_trace()

    # v3_v3_a_feat = 'tests/ens/'
    # v3_v3_d_feat = 'tests/ens2/'
    # # v3_v3_a_feat = r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_flmrsw'
    # all_results, all_learn_options = {}, {}
    # all_results, all_learn_options = util.load_results(v3_v3_a_feat, all_results, all_learn_options, model_filter=None, append_to_key='feat')
    # results = dict([('AB', all_results['AB_or2_md3_lr0.10_n100_V3_on_V3_feat'])])
    # df = feature_importances(results)
    # all_results, all_learn_options = ensemble_cluster_results(directory=[v3_v3_a_feat], ensemble_type='SVM')
    # plot_cluster_results(results=all_results, learn_options=all_learn_options, metrics=['AUC', 'spearmanr'])
    # plot_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\results')
    # all_results = ensemble_cluster_results(ensemble_type='stacking', models_to_ensemble=['L1', 'L2'])
    # all_metrics, gene_names = get_all_metrics(all_results)
    # plot_all_metrics(all_metrics, gene_names, None, save=False)

    # Selects which data-preparation variant runs; "0" and "2" are no-ops.
    #V = "0"
    V = "1"
    if V == "1":
        # `sheet_name` is the keyword accepted by current pandas; the old
        # `sheetname` spelling raises TypeError on modern versions.
        human_data = pandas.read_excel("data/V1_data.xlsx", sheet_name=0, index_col=[0, 1])
        mouse_data = pandas.read_excel("data/V1_data.xlsx", sheet_name=1, index_col=[0, 1])
        # NOTE(review): human_data / mouse_data are loaded but not passed on;
        # verify against combine_organisms' signature whether they should be.
        X, Y = combine_organisms()
        X.to_pickle('../data/X.pd')  # sequence features (i.e. inputs to prediction)
        Y.to_pickle('../data/Y.pd')  # cell-averaged ranks, plus more (i.e. possible targets for prediction)
        print("done writing to file")
    elif V == "2":
        # this is now all in predict.py
        pass
    elif V == "0":
        pass
src/utils/ui.py CHANGED
@@ -32,95 +32,3 @@ def show_error(settings, message, e):
32
  print(f"Error showing error message: {e}")
33
 
34
  exit(-1)
35
-
36
def scale_ui(window, base_width=1920, base_height=1080, font_size=12, header_font_size=30, custom_scale_width=None, custom_scale_height=None):
    """Apply the default font to *window* and resize it for the primary screen.

    The target size is the design size (``custom_scale_width`` x
    ``custom_scale_height``, defaulting to 1150 x 650) scaled by the ratio of
    the primary screen's resolution to ``base_width`` x ``base_height``. The
    window is never made smaller than the size Qt reports after
    ``adjustSize()``. If the window has a ``title`` attribute, its font is
    scaled with the screen width.

    Any exception is caught and reported on stdout; nothing is raised.
    """
    try:
        geometry = QtGui.QGuiApplication.primaryScreen().geometry()
        screen_w = geometry.width()
        screen_h = geometry.height()

        # Body font is fixed; only the header font follows the screen width.
        window.centralWidget().setStyleSheet(f"font: {font_size}pt 'Arial';")

        if hasattr(window, 'title'):
            title_pt = int(header_font_size * (screen_w / base_width))
            window.title.setStyleSheet(f"font: bold {title_pt}pt 'Arial';")

        # Design size scaled to this screen (falsy overrides fall back to defaults).
        target_w = int((screen_w * (custom_scale_width or 1150)) / base_width)
        target_h = int((screen_h * (custom_scale_height or 650)) / base_height)

        # Let Qt compute the content-driven size and use it as a lower bound.
        window.adjustSize()
        target_w = max(target_w, window.size().width())
        target_h = max(target_h, window.size().height())

        window.resize(target_w, target_h)

    except Exception as e:
        print(f"Error in scale_ui: {e}")
70
-
71
def center_ui(window):
    """Move *window* so that it is centred on the primary screen.

    The window keeps its current width and height. Pending events are
    processed first so the reported size is up to date. Any exception is
    caught and reported on stdout; nothing is raised.
    """
    try:
        window.repaint()
        QtWidgets.QApplication.processEvents()

        win_w, win_h = window.width(), window.height()

        # Centre point of the primary screen's geometry.
        middle = QtGui.QGuiApplication.primaryScreen().geometry().center()

        # Top-left corner that puts the window's centre on the screen's centre.
        left = middle.x() - (win_w // 2)
        top = middle.y() - (win_h // 2)

        window.setGeometry(left, top, win_w, win_h)
        window.repaint()
    except Exception as e:
        print(f"Error centering window: {e}")
96
-
97
def position_window(new_window, parent_window=None):
    """Show ``new_window.view`` near its parent and bring it to the foreground.

    Args:
        new_window: object exposing the Qt widget to show as ``.view``.
        parent_window: window used as the positioning reference; defaults to
            the application's currently active window.

    Placement rules:
        * If the view is already visible and active it is only re-raised.
        * If the parent has a truthy ``last_position`` the view moves there.
        * Otherwise the view is offset 50 px right/down from the parent.
        * With no parent at all the view is centred on the primary screen.
    """
    view = new_window.view

    # Already in front: just make sure it stays there.
    if view.isVisible() and view.isActiveWindow():
        view.raise_()
        view.activateWindow()
        QtWidgets.QApplication.setActiveWindow(view)
        return

    if parent_window is None:
        parent_window = QtWidgets.QApplication.activeWindow()

    if parent_window:
        if hasattr(parent_window, 'last_position') and parent_window.last_position:
            view.move(parent_window.last_position)
        else:
            parent_geo = parent_window.geometry()
            view.move(parent_geo.x() + 50, parent_geo.y() + 50)
    else:
        # center_ui operates on a widget (repaint/width/setGeometry), so it
        # must receive the view, consistent with every other call in this
        # function; passing the wrapper object made centring silently fail.
        center_ui(view)

    view.show()
    view.raise_()
    view.activateWindow()
    QtWidgets.QApplication.setActiveWindow(view)

    # Force the window to be active and in the foreground: clear the
    # minimized flag, set the active flag.
    view.setWindowState(
        (view.windowState() & ~QtCore.Qt.WindowState.WindowMinimized)
        | QtCore.Qt.WindowState.WindowActive
    )
    view.raise_()
    view.activateWindow()
 
32
  print(f"Error showing error message: {e}")
33
 
34
  exit(-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/AnnotationParser.py DELETED
@@ -1,429 +0,0 @@
1
- ###############################################################################
2
- # INPUTS: inputs are the annotation files to parse. Currently, only gbff is supported.
3
- # OUTPUTS: the outputs are data structures that store the parsed data
4
- ################################################################################
5
-
6
- from PyQt5 import QtWidgets
7
- import gffutils
8
- import models.GlobalSettings as GlobalSettings
9
- import os
10
- from Bio import SeqIO
11
- import traceback
12
-
13
- logger = GlobalSettings.logger
14
-
15
class Annotation_Parser:
    """Parse genome annotation files and search them for matching features.

    GenBank flat files are read with Biopython's ``SeqIO``. The GFF and
    feature-table parsers are kept, but ``find_which_file_version`` rejects
    those formats, so they are not reached through that entry point.

    Unexpected errors inside the public methods are logged, shown in a
    blocking "Fatal Error" dialog, and then terminate the process with
    exit code -1.
    """

    def __init__(self):
        # Path of the annotation file to parse.
        self.annotationFileName = ""
        # True -> txt_parse keys on the locus_tag column, else product accession.
        self.txtLocusTag = False
        self.isGff = False
        self.isTxt = False
        # Number of chromosomes/scaffolds seen by the most recent parse/search.
        self.max_chrom = 0

        # key: locus tag
        # value: list of [locus tag, chromosome index, feature type,
        #                 start (0-based), end, strand]
        self.reg_dict = dict()

        # Parallel dictionary used for searching.
        # key: description text (product / note / name / symbol)
        # value: list of locus tags (keys of reg_dict)
        self.para_dict = dict()

        # List of (chromosome/scaffold number {int}, matching SeqFeature).
        self.results_list = list()

    def _show_fatal_error(self, log_message, error):
        """Log *error*, show a blocking "Fatal Error" dialog, then exit(-1).

        Call this from inside an ``except`` block so that
        ``traceback.format_exc()`` reports the exception being handled.
        """
        logger.critical(log_message)
        logger.critical(error)
        logger.critical(traceback.format_exc())
        msgBox = QtWidgets.QMessageBox()
        msgBox.setStyleSheet("font: " + str(GlobalSettings.mainWindow.fontSize) + "pt 'Arial'")
        msgBox.setIcon(QtWidgets.QMessageBox.Icon.Critical)
        msgBox.setWindowTitle("Fatal Error")
        msgBox.setText("Fatal Error:\n"+str(error)+ "\n\nFor more information on this error, look at CASPER.log in the application folder.")
        msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Close)
        msgBox.exec()

        exit(-1)

    def flatten_list(self, t):
        """Flatten a list of lists into one list of lower-cased items."""
        return [item.lower() for sublist in t for item in sublist]

    def get_max_chrom(self):
        """Return the number of records in the selected GenBank file.

        Returns 0 when the file contains no records (previously this raised
        ``UnboundLocalError``).
        """
        max_chrom = 0
        parser = SeqIO.parse(self.annotationFileName, 'genbank')
        for i, _record in enumerate(parser):
            max_chrom = i + 1
        return max_chrom

    def get_sequence_info(self, query):
        """Locate *query* in the annotation file's sequences.

        Each record is searched on the forward strand first, then on its
        reverse complement.

        Returns:
            ``(record number (1-based), start, stop)`` for the first hit, or
            ``False`` when the sequence is not found in any record.
        """
        try:
            self.results_list.clear()
            parser = SeqIO.parse(self.annotationFileName, 'genbank')
            for j, record in enumerate(parser):  # one record per chromosome/scaffold
                tmp = str(record.seq).find(query)
                if tmp != -1:
                    return (j+1, tmp+1, tmp+len(query))

                tmp = str(record.seq.reverse_complement()).find(query)
                if tmp != -1:
                    # NOTE(review): these coordinates are offsets within the
                    # reverse-complemented string, not forward-strand
                    # positions, and can be negative for a hit near index 0.
                    # Kept as-is; confirm what callers expect.
                    return (j+1, tmp-len(query), tmp-1)
            return False

        except Exception as e:
            self._show_fatal_error("Error in get_sequence_info() in annotation parser.", e)

    def _feature_matches(self, needle, feature):
        """Return True when lower-cased *needle* occurs in *feature*'s qualifiers.

        Features of type "source" and "gene" never match. When the feature
        has a "translation" qualifier, the last flattened qualifier value is
        left out of the searched text.
        """
        values = self.flatten_list(feature.qualifiers.values())
        if "translation" in feature.qualifiers:
            values = values[:-1]
        return needle in " ".join(values) and feature.type != "source" and feature.type != "gene"

    def genbank_search(self, queries, same_search):
        """Search the GenBank file for features matching any of *queries*.

        Args:
            queries: iterable of search strings (matched case-insensitively).
            same_search: when truthy, return the cached results of the
                previous search without re-reading the file.

        Returns:
            ``self.results_list``: a list of ``(record number, SeqFeature)``.
            ``self.max_chrom`` is updated to the number of records while the
            first query is processed.
        """
        index_number = 0
        try:
            if same_search:
                return self.results_list

            self.results_list.clear()
            for i, query in enumerate(queries):
                needle = query.lower()
                # A fresh parser (iterator) is needed for every query.
                parser = SeqIO.parse(self.annotationFileName, 'genbank')
                for j, record in enumerate(parser):
                    if i == 0:
                        index_number += 1
                    for feature in record.features:
                        if self._feature_matches(needle, feature):
                            self.results_list.append((j+1, feature))
                    if i == 0:
                        # Count chromosomes/scaffolds only once, even with
                        # multiple queries.
                        self.max_chrom = index_number
            return self.results_list

        except Exception as e:
            self._show_fatal_error("Error in genbank_search() in annotation parser.", e)

    def _append_children(self, db, feature, locus_tag, index_number):
        """Add each child of *feature* to ``reg_dict[locus_tag]`` if not present."""
        entries = self.reg_dict[locus_tag]
        for child in db.children(feature.id, level=None, featuretype=None, order_by=None, reverse=False,
                                 limit=None, completely_within=False):
            entry = [locus_tag, index_number, child.featuretype, child.start - 1, child.end, child.strand]
            if entry not in entries:
                entries.append(entry)

    # Precondition: ONLY TO BE USED WITH GFF FILES
    def gff_parse(self):
        """Parse a GFF file into ``reg_dict`` and the parallel ``para_dict``.

        A gffutils database is (re)built at ``<CSPR_DB>/gff_database.db`` and
        then walked feature by feature. Sets ``self.max_chrom`` to the number
        of distinct consecutive sequence ids encountered.
        """
        try:
            self.reg_dict.clear()
            self.para_dict.clear()
            prevFirstIndex = ""
            indexNumber = 1
            data_base_file_name = GlobalSettings.CSPR_DB + "/" + "gff_database.db"
            currentLocusTag = ""
            para_dict_key_string = ""

            # Build the database (gffutils does the actual parsing) ...
            print("Intializing the data base")
            gffutils.create_db(self.annotationFileName, dbfn=data_base_file_name, force=True, keep_order=True,
                               merge_strategy='merge', sort_attribute_values=True)
            print("Finished intializing")

            # ... then reopen it as a feature database.
            db = gffutils.FeatureDB(data_base_file_name, keep_order=True)

            for feature in db.all_features(limit=None, strand=None, featuretype=None, order_by=None, reverse=False,
                                           completely_within=False):
                # A change of sequence id means a new chromosome/scaffold.
                if prevFirstIndex != feature.seqid and prevFirstIndex != "":
                    indexNumber += 1

                if feature.featuretype == "gene" or feature.featuretype == 'pseudogene':
                    # Prefer locus_tag; fall back to Name.
                    if 'locus_tag' in feature.attributes:
                        currentLocusTag = feature.attributes['locus_tag'][0]
                    else:
                        currentLocusTag = feature.attributes["Name"][0]

                    # A new gene starts: flush the description collected so far.
                    if para_dict_key_string != "":
                        tags = self.para_dict.setdefault(para_dict_key_string, [])
                        if currentLocusTag not in tags:
                            tags.append(currentLocusTag)
                        para_dict_key_string = ""

                    self.reg_dict.setdefault(currentLocusTag, []).append(
                        [currentLocusTag, indexNumber, feature.featuretype, feature.start - 1, feature.end,
                         feature.strand])
                    self._append_children(db, feature, currentLocusTag, indexNumber)

                elif feature.featuretype != "region" and feature.featuretype != "telomere" and feature.featuretype != "origin_of_replication":
                    # Direct indexing on purpose: a non-gene feature before
                    # any gene raises KeyError, as it did originally.
                    entries = self.reg_dict[currentLocusTag]
                    entry = [currentLocusTag, indexNumber, feature.featuretype, feature.start - 1, feature.end,
                             feature.strand]
                    if entry not in entries:
                        entries.append(entry)
                    self._append_children(db, feature, currentLocusTag, indexNumber)

                    # NOTE(review): the nesting of the two description blocks
                    # below was reconstructed from a source whose indentation
                    # was lost; they are assumed to belong to this branch.
                    if 'product' in feature.attributes and feature.featuretype == "CDS":
                        product = feature.attributes['product'][0]
                        para_dict_key_string = product if para_dict_key_string == "" else para_dict_key_string + ";" + product
                    if 'Note' in feature.attributes:
                        note = feature.attributes['Note'][0]
                        para_dict_key_string = note if para_dict_key_string == "" else para_dict_key_string + ";" + note

                prevFirstIndex = feature.seqid
            self.max_chrom = indexNumber
        except Exception as e:
            self._show_fatal_error("Error in gff_parse() in annotation parser.", e)

    # Precondition: ONLY TO BE USED WITH TXT FILES
    def txt_parse(self):
        """Parse a tab-separated feature-table file into ``reg_dict``/``para_dict``.

        Lines starting with ``#`` are skipped and parsing stops at the first
        line of two characters or fewer. The dictionary key is column 16 when
        ``self.txtLocusTag`` is true, otherwise column 10. Sets
        ``self.max_chrom`` to the number of distinct consecutive values seen
        in column 6.
        """
        try:
            self.reg_dict.clear()
            prevGenAccession = ""
            indexNumber = 1
            currentLocusTag = ""
            para_dict_key_string = ""
            key_column = 16 if self.txtLocusTag else 10

            # `with` guarantees the handle is closed (it used to leak).
            with open(self.annotationFileName) as fileStream:
                for buffer in fileStream:
                    if buffer.startswith("#"):
                        continue
                    if len(buffer) <= 2:  # blank/short line marks the end of the data
                        break

                    # Strip only the newline so an unterminated last line
                    # does not lose the final character of its last field.
                    splitLine = buffer.rstrip("\n").split("\t")

                    # Increment indexNumber when the genomic accession changes.
                    if prevGenAccession != splitLine[6] and prevGenAccession != "":
                        indexNumber += 1

                    currentLocusTag = splitLine[key_column]
                    values = [currentLocusTag, indexNumber, splitLine[0], int(splitLine[7]) - 1, int(splitLine[8]), splitLine[9]]
                    self.reg_dict.setdefault(currentLocusTag, []).append(values)

                    # Build the search key from columns 13 and 14
                    # (column 12, related accession, is deliberately left out).
                    for column in (13, 14):
                        if splitLine[column] != '':
                            para_dict_key_string = para_dict_key_string + splitLine[column] + ';'
                    para_dict_key_string = para_dict_key_string.replace(',', '')

                    # Store the key only when it carries real text.
                    if len(para_dict_key_string) > 3:
                        if para_dict_key_string[-1] == ';':
                            para_dict_key_string = para_dict_key_string[:-1]

                        tags = self.para_dict.setdefault(para_dict_key_string, [])
                        if currentLocusTag not in tags:
                            tags.append(currentLocusTag)

                        # NOTE(review): the reset is assumed to sit inside this
                        # `if` (indentation was lost in the source), so keys of
                        # three characters or fewer carry over to the next row.
                        para_dict_key_string = ""
                    prevGenAccession = splitLine[6]
            self.max_chrom = indexNumber
        except Exception as e:
            self._show_fatal_error("Error in txt_parse() in annotation parser.", e)

    @staticmethod
    def _detect_format(file_name):
        """Classify *file_name* by substring: ``"gbff"`` or ``-1`` if unsupported.

        GFF and feature-table files are recognised but no longer supported.
        The original final test was ``"gbff" or "gbk" in name``, which is
        always true; it now checks both substrings explicitly so that
        unrecognised files reach the ``-1`` fallback.
        """
        if "gff" in file_name or "feature_table" in file_name:
            # gff and feature-table support is currently deprecated
            print("Error: Wrong annotation file format")
            return -1
        if "gbff" in file_name or "gbk" in file_name:
            return "gbff"
        # -1 makes the caller show its error window
        return -1

    def find_which_file_version(self):
        """Report which parser applies to the selected annotation file.

        Returns:
            ``"gbff"`` for GenBank files, or ``-1`` when no file is selected
            or the format is unsupported.
        """
        try:
            if self.annotationFileName == "" or GlobalSettings.mainWindow.annotation_files.currentText() == "None":
                return -1
            return self._detect_format(self.annotationFileName)
        except Exception as e:
            self._show_fatal_error("Error in find_which_file_version() in annotation parser.", e)
429
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/CMainWindow.py DELETED
@@ -1,987 +0,0 @@
1
- import platform
2
- # import controllers.ncbi as ncbi
3
- import os
4
- # from utils.Algorithms import get_table_headers
5
- # from models.CSPRparser import CSPRparser
6
- import glob
7
- import models.GlobalSettings as GlobalSettings
8
- from PyQt6 import QtWidgets, QtGui, QtCore, uic, QtGui
9
- from utils.ui import scale_ui, center_ui, show_message, show_error
10
- # from views.annotation_functions import *
11
- # from views.AnnotationParser import Annotation_Parser
12
- # from views.AnnotationWindow import AnnotationWindow
13
- # import views.genomeBrowser as genomeBrowser
14
- # from views.NewGenome import NewGenome
15
- # from views.NewEndonuclease import NewEndonuclease
16
- # from controllers.CoTargeting import CoTargeting
17
- # from views.generateLib import genLibrary
18
- # from controllers.Results import Results
19
- # from views.export_tool import export_tool
20
- # from views.closingWin import closingWindow
21
- # from utils.web import ncbi_page, repo_page, ncbi_blast_page
22
- # from controllers.populate_fna_files import PopulateFNAFiles
23
-
24
- # logger = GlobalSettings.logger
25
-
26
- fontSize = 12
27
-
28
- class CMainWindow(QtWidgets.QMainWindow):
29
- def __init__(self, settings):
30
- try:
31
- super(CMainWindow, self).__init__()
32
- # uic.loadUi(os.path.join(self.settings.get_ui_dir(), 'startupCASPER.ui'), self)
33
- # uic.loadUi(GlobalSettings.appdir + 'ui/CASPER_main.ui', self)
34
- # print("path: ", GlobalSettings.appdir + 'ui/CASPER_main_copy_2.ui')
35
- # uic.loadUi(GlobalSettings.appdir + 'ui/CASPER_main_copy_2.ui', self)
36
- self.settings = settings
37
- print("path: ", os.path.join(self.settings.get_ui_dir(), 'CASPER_main.ui'))
38
- uic.loadUi(os.path.join(self.settings.get_ui_dir(), 'CASPER_main.ui'), self)
39
- self.setWindowTitle("CASPER")
40
- self.setWindowIcon(QtGui.QIcon(os.path.join(self.settings.get_assets_dir(), "cas9image.ico")))
41
-
42
- # self.dbpath = ""
43
- # self.inputstring = "" # This is the search string
44
- # # self.info_path = settings.get_app_dir()
45
- # # info_path = settings.get_app_dir()
46
- # self.anno_name = ""
47
- # self.endo_name = ""
48
- # self.fontSize = 12
49
- # self.org = ""
50
- # self.TNumbers = {} # the T numbers from a kegg search
51
- # self.orgcodes = {} # Stores the Kegg organism code by the format {full name : organism code}
52
- # self.gene_list = {} # list of genes (no ides what they pertain to
53
- # self.searches = {}
54
- # self.checkBoxes = []
55
- # self.genlib_list = [] # This list stores selected SeqFeatures from annotation window
56
- # self.checked_info = {}
57
- # self.check_ntseq_info = {} # the ntsequences that go along with the checked_info
58
- # self.annotation_parser = Annotation_Parser()
59
- # self.link_list = list() # the list of the downloadable links from the NCBI search
60
- # self.organismDict = dict() # the dictionary for the links to download. Key is the description of the organism, value is the ID that can be found in link_list
61
- # self.results_list = list()
62
- # self.organismData = list()
63
- # self.ncbi = ncbi.NCBI_search_tool()
64
-
65
- # groupbox_style = """
66
- # QGroupBox:title{subcontrol-origin: margin;
67
- # left: 10px;
68
- # padding: 0 5px 0 5px;}
69
- # QGroupBox#Step1{border: 2px solid rgb(111,181,110);
70
- # border-radius: 9px;
71
- # margin-top: 10px;
72
- # font: bold 14pt 'Arial';}
73
- # """
74
-
75
- # self.Step1.setStyleSheet(groupbox_style)
76
- # self.Step2.setStyleSheet(groupbox_style.replace("Step1", "Step2"))
77
- # self.Step3.setStyleSheet(groupbox_style.replace("Step1", "Step3"))
78
- # self.CASPER_Navigation.setStyleSheet(groupbox_style.replace("Step1", "CASPER_Navigation").replace("solid","dashed").replace("rgb(111,181,110)","rgb(88,89,91)"))
79
-
80
- # self.setWindowIcon(QtGui.QIcon(GlobalSettings.appdir + "cas9image.ico"))
81
- # self.pushButton_FindTargets.clicked.connect(self.gather_settings)
82
- # self.pushButton_ViewTargets.clicked.connect(self.view_results)
83
- # self.pushButton_ViewTargets.setEnabled(False)
84
- # self.GenerateLibrary.setEnabled(False)
85
- # self.radioButton_Gene.clicked.connect(self.toggle_annotation)
86
- # self.radioButton_Position.clicked.connect(self.toggle_annotation)
87
-
88
- """ Connect functions to buttons """
89
- # self.newGenome_button.clicked.connect(self.launch_newGenome) # Connect launch function to New Genome
90
- # self.newEndo_button.clicked.connect(self.launch_newEndonuclease) # Connect launch function to New Endonuclease
91
- # self.multitargeting_button.clicked.connect(self.changeto_multitargeting) # Connect launch function to Multitargeting
92
- # self.populationAnalysis_button.clicked.connect(self.changeto_population_Analysis) # Connect launch function to PA
93
- # self.GenerateLibrary.clicked.connect(self.prep_genlib)
94
- # self.combineFiles_button.clicked.connect(self.launch_populate_fna_files)
95
-
96
- """ Connect functions to actions (menu bar) """
97
- # self.actionOpen_Genome_Browser.triggered.connect(self.launch_newGenomeBrowser)
98
- # self.actionExit.triggered.connect(self.close_app)
99
- # self.visit_repo.triggered.connect(repo_page)
100
- # self.actionChange_Directory.triggered.connect(self.change_directory)
101
- # self.actionNCBI.triggered.connect(ncbi_page)
102
- # self.actionCasper2.triggered.connect(self.open_casper2_web_page)
103
- # self.actionNCBI_BLAST.triggered.connect(ncbi_blast_page)
104
-
105
-
106
-
107
- # self.progressBar.setMinimum(0)
108
- # self.progressBar.setMaximum(100)
109
- # self.progressBar.reset()
110
- # self.Annotation_Window = AnnotationWindow(info_path)
111
- # self.geneEntryField.setPlaceholderText("Example Inputs: \n\n"
112
- # "Option 1: Feature (ID, Locus Tag, or Name)\n"
113
- # "Example: 854068/YOL086C/ADH1 for S. cerevisiae alcohol dehydrogenase 1\n\n"
114
- # "Option 2: Position (chromosome,start,stop)\n"
115
- # "Example: 1,1,1000 for targeting chromosome 1, base pairs 1 to 1000\n\n"
116
- # "Option 3: Sequence (must be within the selected organism)\n"
117
- # "Example: Any nucleotide sequence between 100 and 10,000 base pairs.\n\n"
118
- # "*Note: to multiplex, separate multiple queries by new lines*\n"
119
- # "Example:\n"
120
- # "1,1,1000\n"
121
- # "5,1,500\n"
122
- # "etc.")
123
-
124
- # show functionalities on window
125
- self.populate_fna_files = None
126
- self._new_genome = None
127
- # self.newEndonuclease = NewEndonuclease()
128
- # self.CoTargeting = CoTargeting(info_path)
129
- # self.Results = Results()
130
- # self.export_tool_window = export_tool()
131
- # self.genLib = genLibrary()
132
- # self.myClosingWindow = closingWindow()
133
- # self.genomebrowser = genomeBrowser.genomebrowser()
134
- # self.launch_ncbi_button.clicked.connect(self.launch_ncbi)
135
-
136
- # self.first_show = True
137
- scale_ui(self, custom_scale_width=1150, custom_scale_height=650)
138
- # self.show()
139
- # self.load_dropdown_data()
140
- print("MainWindow initialized")
141
- except Exception as e:
142
- show_error("Error in __init__() in main", e)
143
-
144
- # def get_populate_fna_files(self):
145
- # if self.populate_fna_files is None:
146
- # self.populate_fna_files = PopulateFNAFiles(GlobalSettings.GlobalSettings1(GlobalSettings.appdir))
147
- # return self.populate_fna_files
148
-
149
- # def launch_populate_fna_files(self):
150
- # self.get_populate_fna_files().show() # Ensure the window is shown
151
-
152
- # this function prepares everything for the generate library function
153
- # it is very similar to the gather settings, how ever it stores the data instead of calling the Annotation Window class
154
- # it moves the data onto the generateLib function, and then opens that window
155
- # def prep_genlib(self):
156
- # # make sure the user actually inputs something
157
- # try:
158
- # inputstring = str(self.geneEntryField.toPlainText())
159
- # if (inputstring.startswith("Example Inputs:") or inputstring == ""):
160
- # show_message(
161
- # fontSize=12,
162
- # icon=QtWidgets.QMessageBox.Icon.Critical,
163
- # title="Error",
164
- # message="No gene has been entered. Please enter a gene.",
165
- # button=QtWidgets.QMessageBox.StandardButton.Ok
166
- # )
167
- # return
168
- # else:
169
- # # standardize the input
170
- # inputstring = inputstring.lower()
171
- # found_matches_bool = True
172
- # # call the respective function
173
- # self.progressBar.setValue(10)
174
- # if self.radioButton_Gene.isChecked():
175
- # if len(self.genlib_list) > 0:
176
- # found_matches_bool = True
177
- # else:
178
- # found_matches_bool = False
179
- # elif self.radioButton_Position.isChecked() or self.radioButton_Sequence.isChecked():
180
- # show_message(
181
- # fontSize=12,
182
- # icon=QtWidgets.QMessageBox.Icon.Critical,
183
- # title="Error",
184
- # message="Generate Library can only work with feature searches.",
185
- # button=QtWidgets.QMessageBox.StandardButton.Ok
186
- # )
187
- # return
188
- # """
189
- # elif self.radioButton_Position.isChecked():
190
- # pinput = inputstring.split(';')
191
- # found_matches_bool = self.run_results("position", pinput,openAnnoWindow=False)
192
- # elif self.radioButton_Sequence.isChecked():
193
- # sinput = inputstring
194
- # found_matches_bool = self.run_results("sequence", sinput, openAnnoWindow=False)
195
- # """
196
- # # if matches are found
197
- # if found_matches_bool == True:
198
- # # get the cspr file name
199
- # cspr_file = self.organisms_to_files[self.orgChoice.currentText()][self.endoChoice.currentText()][0]
200
- # if platform.system() == 'Windows':
201
- # cspr_file = GlobalSettings.CSPR_DB + '\\' + cspr_file
202
- # else:
203
- # cspr_file = GlobalSettings.CSPR_DB + '/' + cspr_file
204
- # kegg_non = 'non_kegg'
205
-
206
- # # launch generateLib
207
- # self.progressBar.setValue(100)
208
-
209
- # # calculate the total number of matches found
210
- # tempSum = len(self.genlib_list)
211
-
212
- # # warn the user if the number is greater than 50
213
- # if tempSum > 50:
214
- # msgBox = QtWidgets.QMessageBox()
215
- # msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
216
- # msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
217
- # msgBox.setWindowTitle("Many Matches Found")
218
- # msgBox.setText("More than 50 matches have been found. Continuing could cause a slow down...\n\n Do you wish to continue?")
219
- # msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
220
- # msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
221
- # msgBox.exec()
222
-
223
- # if (msgBox.result() == QtWidgets.QMessageBox.No):
224
- # self.searches.clear()
225
- # self.progressBar.setValue(0)
226
- # return -2
227
-
228
- # self.genLib.launch(self.genlib_list,cspr_file, kegg_non)
229
- # else:
230
- # self.progressBar.setValue(0)
231
- # except Exception as e:
232
- # show_error("Error in prep_genlib() in main", e)
233
-
234
- # # Function for collecting the settings from the input field and transferring them to run_results
235
- # def gather_settings(self):
236
- # try:
237
- # ### If user searches multiple times for the same thing, this avoids re-searching the entire annotation file
238
- # check_org = self.orgChoice.currentText().lower()
239
- # check_endo = self.endoChoice.currentText().lower()
240
- # check_anno_name = self.annotation_files.currentText().lower()
241
- # check_input = str(self.geneEntryField.toPlainText()).lower()
242
- # if (check_input == self.inputstring and check_org == self.org and check_anno_name == self.anno_name and check_endo == self.endo_name):
243
- # same_search = True
244
- # else:
245
- # self.org = check_org
246
- # self.anno_name = check_anno_name
247
- # self.inputstring = check_input
248
- # self.endo_name = check_endo
249
- # same_search = False
250
-
251
- # # Error check: make sure the user actually inputs something
252
- # if (self.inputstring.startswith("Example Inputs:") or self.inputstring == ""):
253
- # show_message(
254
- # fontSize=12,
255
- # icon=QtWidgets.QMessageBox.Icon.Critical,
256
- # title="Error",
257
- # message="No feature has been searched for. Please enter a search.",
258
- # button=QtWidgets.QMessageBox.StandardButton.Ok
259
- # )
260
- # return
261
- # else:
262
-
263
- # ### Remove additional scoring columns if necessary
264
- # header = get_table_headers(self.Results.targetTable) # Returns headers of the target table in View Targets window
265
- # col_indices = [header.index(x) for x in GlobalSettings.algorithms if x in header] # Returns the index(es) of the alternative scoring column(s) in the target table of View Targets window
266
- # if len(col_indices) > 0: # If alternative scoring has been done
267
- # for i in col_indices:
268
- # self.Results.targetTable.removeColumn(i)
269
- # self.Results.targetTable.resizeColumnsToContents()
270
-
271
- # self.progressBar.setValue(10)
272
- # if self.radioButton_Gene.isChecked():
273
- # ginput = [x.strip() for x in self.inputstring.split('\n')] # Split search based on newline character and remove deadspace
274
- # self.run_results("feature", ginput, same_search)
275
- # elif self.radioButton_Position.isChecked():
276
- # pinput = [x.strip() for x in self.inputstring.split('\n')] # Split search based on newline character and remove deadspace
277
- # self.run_results("position", pinput, same_search)
278
- # elif self.radioButton_Sequence.isChecked():
279
- # sinput = self.inputstring
280
- # self.run_results("sequence", sinput, same_search)
281
- # except Exception as e:
282
- # show_error("Error in gather_settings() in main", e)
283
-
284
- # # ---- Following functions are for running the auxillary algorithms and windows ---- #
285
- # # this function is parses the annotation file given, and then goes through and goes onto results
286
- # # it will call other versions of collect_table_data and fill_table that work with these file types
287
- # # this function should work with the any type of annotation file, besides kegg.
288
- # # this assumes that the parsers all store the data the same way, which gff and feature table do
289
- # # please make sure the genbank parser stores the data in the same way
290
- # # so far the gff files seems to all be different. Need to think about how we want to parse it
291
- # def run_results_own_ncbi_file(self, inputstring, fileName, same_search, openAnnoWindow=True):
292
- # try:
293
- # self.set_progress(35)
294
- # self.results_list = self.annotation_parser.genbank_search(inputstring, same_search)
295
-
296
- # cspr_file = self.organisms_to_files[self.orgChoice.currentText()][self.endoChoice.currentText()][0]
297
- # cspr_file = os.path.join(GlobalSettings.CSPR_DB, cspr_file)
298
-
299
- # own_cspr_parser = CSPRparser(cspr_file)
300
- # own_cspr_parser.read_first_lines()
301
- # if len(own_cspr_parser.karystatsList) != self.annotation_parser.max_chrom:
302
- # show_message(
303
- # fontSize=12,
304
- # icon=QtWidgets.QMessageBox.Icon.Warning,
305
- # title="Warning:",
306
- # message="The number of chromosomes do not match. This could cause errors.",
307
- # button=QtWidgets.QMessageBox.StandardButton.Ok
308
- # )
309
- # self.set_progress(60)
310
-
311
- # self.searches.clear()
312
-
313
- # self.set_progress(75)
314
- # if not self.results_list:
315
- # show_message(
316
- # fontSize=12,
317
- # icon=QtWidgets.QMessageBox.Icon.Critical,
318
- # title="No Matches Found",
319
- # message="No matches found with that search, please try again.",
320
- # button=QtWidgets.QMessageBox.StandardButton.Ok
321
- # )
322
- # self.set_progress(0)
323
- # return False if not openAnnoWindow else None
324
-
325
- # self.set_progress(80)
326
-
327
- # return self.Annotation_Window.fill_table_nonKegg(self, self.results_list) if openAnnoWindow else True
328
- # except Exception as e:
329
- # show_error(f"Error in run_results_own_ncbi_file() in main.", e)
330
-
331
- # def set_progress(self, value):
332
- # self.progressBar.setValue(value)
333
-
334
- # def run_results(self, inputtype, inputstring, same_search, openAnnoWindow=True):
335
- # try:
336
- # file_name = self.annotation_files.currentText()
337
- # for file in glob.glob(GlobalSettings.CSPR_DB + "/**/*.gb*", recursive=True):
338
- # if file_name in file:
339
- # self.annotation_parser.annotationFileName = file
340
- # break
341
- # self.Results.annotation_path = self.annotation_parser.annotationFileName
342
-
343
- # progvalue = 15
344
- # self.searches = {}
345
- # self.gene_list = {}
346
- # self.progressBar.setValue(progvalue)
347
-
348
- # try:
349
- # self.Results.endonucleaseBox.currentIndexChanged.disconnect()
350
- # except Exception as e:
351
- # pass
352
- # # set Results endo combo box
353
- # self.Results.endonucleaseBox.clear()
354
-
355
- # # set the results window endoChoice box menu
356
- # # set the mainWindow's endoChoice first, and then loop through and set the rest of them
357
- # self.Results.endonucleaseBox.addItem(self.endoChoice.currentText())
358
- # for item in self.organisms_to_endos[str(self.orgChoice.currentText())]:
359
- # if item != self.Results.endonucleaseBox.currentText():
360
- # self.Results.endonucleaseBox.addItem(item)
361
-
362
- # self.Results.endonucleaseBox.currentIndexChanged.connect(self.Results.changeEndonuclease)
363
- # self.Results.get_endo_data()
364
-
365
- # # self.Results.change_start_end_button.setEnabled(False)
366
- # self.Results.displayGeneViewer.setChecked(0)
367
-
368
- # if inputtype == "feature":
369
- # fileType = self.annotation_parser.find_which_file_version()
370
-
371
- # # if the parser retuns the 'wrong file type' error
372
- # if fileType == -1:
373
- # show_message(
374
- # fontSize=12,
375
- # icon=QtWidgets.QMessageBox.Icon.Critical,
376
- # title="Error",
377
- # message="Feature search requires a GenBank formatted annotation file. Please select a file from the dropdown menu or search by position",
378
- # button=QtWidgets.QMessageBox.StandardButton.Ok
379
- # )
380
- # self.progressBar.setValue(0)
381
- # return
382
-
383
- # # make sure an annotation file has been selected
384
- # if self.annotation_files.currentText() == "None":
385
- # show_message(
386
- # fontSize=12,
387
- # icon=QtWidgets.QMessageBox.Icon.Critical,
388
- # title="Error",
389
- # message="Search by feature requires a GenBank annotation file. Please select one from the dropdown menu or search by position.",
390
- # button=QtWidgets.QMessageBox.StandardButton.Ok
391
- # )
392
- # self.progressBar.setValue(0)
393
- # return
394
-
395
- # # this now just goes onto the other version of run_results
396
- # myBool = self.run_results_own_ncbi_file(inputstring, self.annotation_files.currentText(), same_search, openAnnoWindow=openAnnoWindow)
397
- # if not openAnnoWindow:
398
- # return myBool
399
- # else:
400
- # self.progressBar.setValue(0)
401
- # return
402
-
403
- # if inputtype == "position":
404
- # full_org = str(self.orgChoice.currentText())
405
- # self.checked_info.clear()
406
- # self.check_ntseq_info.clear()
407
-
408
- # for item in inputstring: # Loop through each search
409
- # searchIndices = [x.strip() for x in item.split(',')] # Parse input query
410
-
411
- # if len(searchIndices) != 3:
412
- # show_message(
413
- # fontSize=12,
414
- # icon=QtWidgets.QMessageBox.Icon.Critical,
415
- # title="Error",
416
- # message="There are 3 arguments required for this function: chromosome, start position, and end position.",
417
- # button=QtWidgets.QMessageBox.StandardButton.Ok
418
- # )
419
- # self.progressBar.setValue(0)
420
- # return
421
-
422
- # if not searchIndices[0].isdigit() or not searchIndices[1].isdigit() or not searchIndices[2].isdigit():
423
- # show_message(
424
- # fontSize=12,
425
- # icon=QtWidgets.QMessageBox.Icon.Critical,
426
- # title="Error",
427
- # message="The positions given must be integers. Please try again.",
428
- # button=QtWidgets.QMessageBox.StandardButton.Ok
429
- # )
430
- # self.progressBar.setValue(0)
431
- # return
432
- # elif int(searchIndices[1]) >= int(searchIndices[2]):
433
- # show_message(
434
- # fontSize=12,
435
- # icon=QtWidgets.QMessageBox.Icon.Critical,
436
- # title="Error",
437
- # message="The start index must be less than the end index.",
438
- # button=QtWidgets.QMessageBox.StandardButton.Ok
439
- # )
440
- # self.progressBar.setValue(0)
441
- # return
442
- # elif abs(int(searchIndices[2])-int(searchIndices[1])) > 50000:
443
- # show_message(
444
- # fontSize=12,
445
- # icon=QtWidgets.QMessageBox.Icon.Critical,
446
- # title="Error",
447
- # message="The search range must be less than 50,000 nt.",
448
- # button=QtWidgets.QMessageBox.StandardButton.Ok
449
- # )
450
- # self.progressBar.setValue(0)
451
- # return
452
- # elif int(searchIndices[0]) > self.annotation_parser.get_max_chrom():
453
- # show_message(
454
- # fontSize=12,
455
- # icon=QtWidgets.QMessageBox.Icon.Critical,
456
- # title="Error",
457
- # message="Chromosome %s does not exist in the selected annotation file." % searchIndices[0],
458
- # button=QtWidgets.QMessageBox.StandardButton.Ok
459
- # )
460
- # self.progressBar.setValue(0)
461
- # return
462
- # # append the data into the checked_info
463
- # tempString = 'chrom: ' + str(searchIndices[0]) + ',start: ' + str(searchIndices[1]) + ',end: ' + str(searchIndices[2])
464
- # self.checked_info[tempString] = (int(searchIndices[0]), int(searchIndices[1])-1, int(searchIndices[2]))
465
-
466
- # self.progressBar.setValue(50)
467
- # self.Results.transfer_data(full_org, self.organisms_to_files[full_org], [str(self.endoChoice.currentText())], os.getcwd(), self.checked_info, self.check_ntseq_info,inputtype)
468
- # self.Results.load_gene_viewer()
469
- # self.progressBar.setValue(100)
470
- # self.pushButton_ViewTargets.setEnabled(True)
471
- # self.GenerateLibrary.setEnabled(True)
472
-
473
- # if inputtype == "sequence":
474
- # fileType = self.annotation_parser.find_which_file_version()
475
-
476
- # if fileType == -1:
477
- # show_message(
478
- # fontSize=12,
479
- # icon=QtWidgets.QMessageBox.Icon.Critical,
480
- # title="Error",
481
- # message="Search by sequence requires a GenBank annotation file. Please select one from the dropdown menu or search by position.",
482
- # button=QtWidgets.QMessageBox.StandardButton.Ok
483
- # )
484
- # self.progressBar.setValue(0)
485
- # return
486
- # if self.annotation_files.currentText() == "None":
487
- # show_message(
488
- # fontSize=12,
489
- # icon=QtWidgets.QMessageBox.Icon.Critical,
490
- # title="Error",
491
- # message="Search by sequence requires a GenBank annotation file. Please select one from the dropdown menu or search by position.",
492
- # button=QtWidgets.QMessageBox.StandardButton.Ok
493
- # )
494
- # self.progressBar.setValue(0)
495
- # return
496
-
497
- # checkString = 'AGTCN'
498
- # full_org = str(self.orgChoice.currentText())
499
- # self.checked_info.clear()
500
- # self.progressBar.setValue(10)
501
- # inputstring = inputstring.replace('\n','').upper().strip()
502
-
503
- # for letter in inputstring:
504
- # if letter not in checkString:
505
- # show_message(
506
- # fontSize=12,
507
- # icon=QtWidgets.QMessageBox.Icon.Critical,
508
- # title="Error",
509
- # message="The sequence must consist of A, G, T, C, or N. No other characters are allowed.",
510
- # button=QtWidgets.QMessageBox.StandardButton.Ok
511
- # )
512
- # self.progressBar.setValue(0)
513
- # return
514
-
515
- # if len(inputstring) < 100:
516
- # show_message(
517
- # fontSize=12,
518
- # icon=QtWidgets.QMessageBox.Icon.Critical,
519
- # title="Error",
520
- # message="The sequence given is too small. At least 100 characters are required.",
521
- # button=QtWidgets.QMessageBox.StandardButton.Ok
522
- # )
523
- # self.progressBar.setValue(0)
524
- # return
525
-
526
- # if len(inputstring) > 10000:
527
- # show_message(
528
- # fontSize=12,
529
- # icon=QtWidgets.QMessageBox.Icon.Question,
530
- # title="Large Sequence Detected",
531
- # message="The sequence given is too large one.\n\nPlease input a sequence less than 10kb in length.",
532
- # button=QtWidgets.QMessageBox.StandardButton.Yes
533
- # )
534
- # self.progressBar.setValue(0)
535
- # return
536
-
537
- # self.progressBar.setValue(30)
538
-
539
- # # Check the GBFF file for the sequence
540
- # my_check = self.annotation_parser.get_sequence_info(inputstring)
541
-
542
- # self.progressBar.setValue(55) # Update progress bar
543
-
544
- # if type(my_check) == bool:
545
- # show_message(
546
- # fontSize=12,
547
- # icon=QtWidgets.QMessageBox.Icon.Question,
548
- # title="Sequence Not Found",
549
- # message="The sequence entered was not found.\n\nPlease input a sequence that is in the selected organism.",
550
- # button=QtWidgets.QMessageBox.StandardButton.Yes
551
- # )
552
- # self.progressBar.setValue(0)
553
- # return
554
-
555
- # else:
556
- # tempString = 'chrom: ' + str(my_check[0]) + ',start: ' + str(my_check[1]) + ',end: ' + str(my_check[2])
557
- # self.checked_info[tempString] = (int(my_check[0]), int(my_check[1])-1, int(my_check[2]))
558
-
559
- # self.progressBar.setValue(75)
560
-
561
- # self.Results.transfer_data(full_org, self.organisms_to_files[full_org], [str(self.endoChoice.currentText())], os.getcwd(), self.checked_info, self.check_ntseq_info, inputtype)
562
- # self.Results.load_gene_viewer()
563
- # self.progressBar.setValue(100)
564
- # self.pushButton_ViewTargets.setEnabled(True)
565
- # self.GenerateLibrary.setEnabled(True)
566
- # except Exception as e:
567
- # show_error("Error in run_results() in main", e)
568
-
569
- # def handle_feature_search(self, input_string, open_anno_window):
570
- # file_type = self.annotation_parser.find_which_file_version()
571
- # if file_type == -1 or self.annotation_files.currentText() == "None":
572
- # self.show_error_message("Feature search requires a GenBank formatted annotation file.")
573
- # return False
574
-
575
- # return self.run_results_own_ncbi_file(input_string, self.annotation_files.currentText(), same_search, open_anno_window)
576
-
577
- # def launch_newGenome(self):
578
- # try:
579
- # # Update endo list
580
- # self.get_new_genome().fillEndo()
581
- # if self.get_new_genome().first_show:
582
- # center_ui(self.get_new_genome())
583
- # self.get_new_genome().first_show = False
584
- # self.hide()
585
- # self.get_new_genome().show()
586
- # except Exception as e:
587
- # show_error("Error in launch_newGenome() in main", e)
588
-
589
- # def launch_newEndonuclease(self):
590
- # try:
591
- # center_ui(self.newEndonuclease)
592
- # self.newEndonuclease.show()
593
- # self.newEndonuclease.activateWindow()
594
- # except Exception as e:
595
- # show_error("Error in launch_newEndonuclease() in main", e)
596
-
597
- # #launch genome browser tool
598
- # def launch_newGenomeBrowser(self):
599
- # try:
600
- # self.genomebrowser.createGraph(self)
601
- # except Exception as e:
602
- # show_error("Error in launch_newGenomeBrowser() in main", e)
603
-
604
- # def launch_ncbi(self):
605
- # try:
606
- # show_message(
607
- # fontSize=12,
608
- # icon=QtWidgets.QMessageBox.Icon.Information,
609
- # title="Note:",
610
- # message="NCBI Annotation Guidelines:\n\nDownload annotation files of the exact species and strain used in Analyze New Genome.\n\nMismatched annotation files will inhibit downstream analyses.",
611
- # button=QtWidgets.QMessageBox.StandardButton.Ok
612
- # )
613
- # if self.ncbi.first_show:
614
- # self.ncbi.first_show = False
615
- # center_ui(self.ncbi)
616
-
617
- # self.ncbi.show()
618
- # self.ncbi.activateWindow()
619
- # except Exception as e:
620
- # show_error("launch_ncbi() in main", e)
621
-
622
- # # this function does the same stuff that the other collect_table_data does, but works with the other types of files
623
- # def collect_table_data_nonkegg(self):
624
- # try:
625
- # # start out the same as the other collect_table_data
626
- # self.checked_info.clear()
627
- # self.genlib_list.clear()
628
- # self.check_ntseq_info.clear()
629
- # full_org = str(self.orgChoice.currentText())
630
- # holder = ()
631
- # selected_indices = []
632
- # selected_rows = self.Annotation_Window.tableWidget.selectionModel().selectedRows()
633
- # for ind in sorted(selected_rows):
634
- # selected_indices.append(ind.row())
635
-
636
- # for item in self.checkBoxes:
637
- # feature = item[1]
638
- # # If inidices of checkBoxes list and selected rows in table match...
639
- # if item[2] in selected_indices:
640
- # holder = (item[0],int(feature.location.start),int(feature.location.end)) # Tuple order: Feature chromosome/scaffold number, feature start, feature end
641
- # ### If locus tag available, combine with gene name to create dict key
642
- # if 'locus_tag' in feature.qualifiers:
643
- # tag = feature.qualifiers['locus_tag'][0]
644
- # key = tag + ": " + get_name(feature)
645
- # else:
646
- # key = get_name(feature)
647
- # self.checked_info[key] = holder
648
- # self.genlib_list.append((item[0],feature)) # Tuple order: Feature chromosome/scaffold number, SeqFeature object
649
- # else:
650
- # # If item was not selected in the table, go to the next item
651
- # continue
652
-
653
- # # now call transfer data
654
- # self.progressBar.setValue(95)
655
- # self.Results.transfer_data(full_org, self.organisms_to_files[full_org], [str(self.endoChoice.currentText())], os.getcwd(),
656
- # self.checked_info, self.check_ntseq_info,inputtype="feature")
657
- # self.Results.load_gene_viewer()
658
-
659
- # self.progressBar.setValue(100)
660
- # self.pushButton_ViewTargets.setEnabled(True)
661
- # self.GenerateLibrary.setEnabled(True)
662
- # except Exception as e:
663
- # show_error("Error in collect_table_data_nonkegg() in main", e)
664
-
665
- # def separate_line(self, input_string):
666
- # try:
667
- # export_array = []
668
- # while True:
669
- # index = input_string.find('\n')
670
- # if index == -1:
671
- # if len(input_string) == 0:
672
- # return export_array
673
- # else:
674
- # export_array.append(input_string)
675
- # return export_array
676
- # export_array.append(input_string[:index])
677
- # input_string = input_string[index + 1:]
678
- # except Exception as e:
679
- # show_error("Error in seperate_line() in main", e)
680
-
681
- # def removeWhiteSpace(self, strng):
682
- # try:
683
- # while True:
684
- # if len(strng) == 0 or (strng[0] != " " and strng[0] != "\n"):
685
- # break
686
- # strng = strng[1:]
687
- # while True:
688
- # if len(strng) == 0 or (strng[len(strng) - 1] != " " and strng[0] != "\n"):
689
- # return strng
690
- # strng = strng[:len(strng) - 1]
691
- # except Exception as e:
692
- # show_error("Error in removeWhiteSpace() in main", e)
693
-
694
- # # Function to enable and disable the Annotation function if searching by position or sequence
695
- # def toggle_annotation(self):
696
- # try:
697
- # if self.radioButton_Gene.isChecked():
698
- # self.Step2.setEnabled(True)
699
- # else:
700
- # self.Step2.setEnabled(True)
701
- # except Exception as e:
702
- # show_error("Error in toggle_annotation() in main", e)
703
-
704
- # def fill_annotation_dropdown(self):
705
- # try:
706
- # #recursive search for all GenBank files in casper db folder
707
- # self.annotation_files.clear()
708
- # annotation_files = glob.glob(GlobalSettings.CSPR_DB + "/**/*.gb*", recursive=True)
709
- # if platform.system() == "Windows":
710
- # for i in range(len(annotation_files)):
711
- # annotation_files[i] = annotation_files[i].replace("/","\\")
712
- # annotation_files[i] = annotation_files[i][annotation_files[i].rfind("\\") + 1:]
713
- # else:
714
- # for i in range(len(annotation_files)):
715
- # annotation_files[i] = annotation_files[i].replace("\\","/")
716
- # annotation_files[i] = annotation_files[i][annotation_files[i].rfind("/") + 1:]
717
-
718
- # annotation_files.sort(key=str.lower)
719
- # self.annotation_files.addItems(annotation_files)
720
- # self.annotation_files.addItems(["None"])
721
- # except Exception as e:
722
- # show_error("Error in fill_annotation_dropdown() in main", e)
723
-
724
- # def make_dictonary(self):
725
- # try:
726
- # url = "https://www.genome.jp/dbget-bin/get_linkdb?-t+genes+gn:" + self.TNumbers[
727
- # self.Annotations_Organism.currentText()]
728
- # source_code = requests.get(url, verify=False)
729
- # plain_text = source_code.text
730
- # buf = io.StringIO(plain_text)
731
-
732
- # while True:
733
- # line = buf.readline()
734
- # if line[0] == "-":
735
- # break
736
- # while True:
737
- # line = buf.readline()
738
- # if line[1] != "a":
739
- # return
740
- # line = line[line.find(">") + 1:]
741
- # seq = line[line.find(":") + 1:line.find("<")]
742
- # line = line[line.find(">") + 1:]
743
-
744
- # i = 0
745
- # while True:
746
- # if line[i] == " ":
747
- # i = i + 1
748
- # else:
749
- # break
750
- # key = line[i:line.find("\n") - 1]
751
- # if key in self.gene_list:
752
- # if seq not in self.gene_list[key]:
753
- # self.gene_list[key].append(seq)
754
- # else:
755
- # self.gene_list[key] = [seq]
756
- # z = 5
757
- # except Exception as e:
758
- # show_error("Error in make_dictionary() in main", e)
759
-
760
- # def organism_finder(self, long_str):
761
- # try:
762
- # semi = long_str.find(";")
763
- # index = 1
764
- # while True:
765
- # if long_str[semi - index] == " ":
766
- # break
767
- # index = index + 1
768
- # return long_str[:semi - index]
769
- # except Exception as e:
770
- # show_error("Error in organism_finder() in main", e)
771
-
772
- # # This method is for testing the execution of a button call to make sure the button is linked properly
773
- # def testexe(self):
774
- # try:
775
- # msgBox = QtWidgets.QMessageBox()
776
- # msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
777
- # msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
778
- # msgBox.setWindowTitle("Extract!")
779
- # msgBox.setText(
780
- # "Are you sure you want to quit?")
781
- # msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
782
- # msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
783
- # msgBox.exec()
784
-
785
- # if msgBox.result() == QtWidgets.QMessageBox.Yes:
786
- # # print(self.orgChoice.currentText())
787
- # sys.exit()
788
- # else:
789
- # pass
790
- # except Exception as e:
791
- # show_error("Error in testexe() in main", e)
792
-
793
- # def getData(self):
794
- # try:
795
- # try:
796
- # self.orgChoice.currentIndexChanged.disconnect()
797
- # except Exception as e:
798
- # pass
799
-
800
- # self.orgChoice.clear()
801
- # self.endoChoice.clear()
802
- # mypath = os.getcwd()
803
- # found = False
804
- # self.dbpath = mypath
805
- # onlyfiles = [str(f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
806
- # onlyfiles.sort(key=str.lower)
807
- # self.organisms_to_files = {}
808
- # self.organisms_to_endos = {}
809
- # first = True
810
- # for file in onlyfiles:
811
- # if file.find('.cspr') != -1:
812
- # if first == True:
813
- # first = False
814
- # found = True
815
- # newname = file[0:-4]
816
- # endo = newname[newname.rfind("_")+1:-1]
817
- # hold = open(file, 'r')
818
- # buf = (hold.readline())
819
- # buf = str(buf)
820
- # buf = buf.strip()
821
- # species = buf.replace("GENOME: ",'')
822
-
823
- # if species in self.organisms_to_files:
824
- # self.organisms_to_files[species][endo] = [file, file.replace(".cspr", "_repeats.db")]
825
- # else:
826
- # self.organisms_to_files[species] = {}
827
- # self.organisms_to_files[species][endo] = [file, file.replace(".cspr", "_repeats.db")]
828
-
829
- # if species in self.organisms_to_endos:
830
- # self.organisms_to_endos[species].append(endo)
831
- # else:
832
- # self.organisms_to_endos[species] = [endo]
833
- # if self.orgChoice.findText(species) == -1:
834
- # self.orgChoice.addItem(species)
835
-
836
- # if found == False:
837
- # return False
838
-
839
- # self.endoChoice.clear()
840
- # self.endoChoice.addItems(self.organisms_to_endos[str(self.orgChoice.currentText())])
841
- # self.orgChoice.currentIndexChanged.connect(self.changeEndos)
842
- # except Exception as e:
843
- # show_error("Error in getData() in main.", e)
844
-
845
- # def changeEndos(self):
846
- # try:
847
- # if self.orgChoice.currentText() != "Custom Input Sequences":
848
- # self.Step2.setEnabled(True)
849
- # self.endoChoice.setEnabled(True)
850
- # self.radioButton_Gene.show()
851
- # self.radioButton_Position.show()
852
- # self.endoChoice.clear()
853
- # self.endoChoice.addItems(self.organisms_to_endos[str(self.orgChoice.currentText())])
854
- # else:
855
- # self.Step2.setEnabled(False)
856
- # self.endoChoice.clear()
857
- # self.endoChoice.setEnabled(False)
858
- # self.radioButton_Gene.hide()
859
- # self.radioButton_Position.hide()
860
- # except Exception as e:
861
- # show_error("Error in changeEndos() in main", e)
862
-
863
- # def change_directory(self):
864
- # try:
865
- # mydir = QtWidgets.QFileDialog.getExistingDirectory(
866
- # None, "Open a folder...", self.dbpath, QtWidgets.QFileDialog.Option.ShowDirsOnly)
867
-
868
- # if not os.path.isdir(mydir):
869
- # show_message(
870
- # fontSize=12,
871
- # icon=QtWidgets.QMessageBox.Icon.Critical,
872
- # title="Not a directory",
873
- # message="The directory you selected does not exist."
874
- # )
875
- # return
876
-
877
- # if not any(file.endswith(".cspr") for file in os.listdir(mydir)):
878
- # show_message(
879
- # fontSize=12,
880
- # icon=QtWidgets.QMessageBox.Icon.Critical,
881
- # title="Directory is invalid!",
882
- # message="You must select a directory with CSPR Files!"
883
- # )
884
- # return
885
-
886
- # os.chdir(mydir)
887
- # mydir = mydir.replace("/", "\\") if platform.system() == "Windows" else mydir
888
- # GlobalSettings.CSPR_DB = mydir
889
-
890
- # GlobalSettings.MTWin.directory = mydir
891
- # GlobalSettings.MTWin.get_data()
892
- # GlobalSettings.pop_Analysis.get_data()
893
- # self.getData()
894
- # self.fill_annotation_dropdown()
895
- # except Exception as e:
896
- # show_error("Error in change_directory() in main.", e)
897
-
898
- # def changeto_multitargeting(self):
899
- # try:
900
- # os.chdir(os.getcwd())
901
- # if GlobalSettings.MTWin.first_show == True:
902
- # GlobalSettings.MTWin.show()
903
- # GlobalSettings.MTWin.first_show = False
904
- # else:
905
- # GlobalSettings.MTWin.show()
906
- # GlobalSettings.mainWindow.hide()
907
-
908
- # except Exception as e:
909
- # show_error("Error in changeto_multitargeting() in main.", e)
910
-
911
- # #change to population analysis window
912
- # def changeto_population_Analysis(self):
913
- # try:
914
- # GlobalSettings.pop_Analysis.launch()
915
- # if GlobalSettings.pop_Analysis.first_show == True:
916
- # center_ui(GlobalSettings.pop_Analysis)
917
- # GlobalSettings.pop_Analysis.first_show = False
918
- # GlobalSettings.pop_Analysis.show()
919
- # GlobalSettings.mainWindow.hide()
920
- # except Exception as e:
921
- # show_error("Error in changeto_population_Analysis() in main.", e)
922
-
923
- # def annotation_information(self):
924
- # try:
925
- # show_message(
926
- # fontSize=12,
927
- # icon=QtWidgets.QMessageBox.Icon.Critical,
928
- # title="Annotation Information",
929
- # message="Annotation files are used for searching for spacers on a gene/locus basis and can be selected here using either " \
930
- # "NCBI databases or a local file."
931
- # )
932
- # except Exception as e:
933
- # show_error("Error in annotation_information() in main.", e)
934
-
935
- # @QtCore.pyqtSlot()
936
- # def view_results(self):
937
- # try:
938
- # #center results window on current screen
939
- # if self.Results.first_show == True:
940
- # self.Results.first_show = False
941
- # self.Results.centerUI()
942
-
943
- # self.Results.show()
944
- # self.hide()
945
- # except Exception as e:
946
- # show_error("Error in view_results() in main", e)
947
-
948
- # def closeFunction(self):
949
- # try:
950
- # # Attempt to close the NCBI window if it exists
951
- # try:
952
- # self.ncbi.close()
953
- # except AttributeError:
954
- # print("No NCBI window to close.")
955
-
956
- # self.myClosingWindow.get_files()
957
- # center_ui(self.myClosingWindow)
958
- # self.myClosingWindow.show()
959
- # except Exception as e:
960
- # show_error("Error in closeFunction() in main", e)
961
-
962
- # def close_app(self):
963
- # try:
964
- # # Attempt to close the NCBI window if it exists
965
- # try:
966
- # self.ncbi.close()
967
- # except Exception as e:
968
- # print("No NCBI window to close.")
969
-
970
- # self.closeFunction()
971
- # self.close()
972
- # except Exception as e:
973
- # show_error("Error in close_app() in main", e)
974
-
975
- # def load_dropdown_data(self):
976
- # """Fill in organism/endo/annotation dropdown information."""
977
- # try:
978
- # self.getData()
979
- # self.fill_annotation_dropdown()
980
- # # self.logger.debug("Successfully loaded organism/endo/annotation drop down information in Main.")
981
- # except Exception as e:
982
- # show_error("Error in load_dropdown_data() in Main", e)
983
-
984
- # # Call methods for other windows if needed
985
- # # self.load_mt_data()
986
- # # self.load_pop_analysis_data()
987
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/CloseableTabWidget.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyQt6.QtWidgets import QTabWidget, QTabBar, QToolButton, QWidget
2
+ from PyQt6.QtCore import pyqtSignal, QSize, Qt
3
+ from PyQt6.QtGui import QCursor
4
+ from PyQt6 import QtWidgets
5
+ import logging
6
+
7
class CloseableTabWidget(QTabWidget):
    """QTabWidget whose first tab is permanent and whose other tabs get a close button.

    Index 0 is treated as the home tab: it never receives a close button and
    cannot be closed or moved through this class. Every other tab added via
    ``addTab`` gets a ``QToolButton`` on the right-hand side of its tab.

    Signals:
        tab_closed(QWidget): emitted with the page widget after its tab has
            been removed and before the widget is scheduled for deletion.
    """

    tab_closed = pyqtSignal(QWidget)

    def __init__(self, parent=None):
        super().__init__(parent)
        # Create the logger and tracking state before any signal is connected
        # so that no handler can run against a half-initialised object.
        self.logger = logging.getLogger(__name__)
        # tab_id ("<label>_<id(widget)>") -> {'widget', 'label', 'close_button'}
        self._tabs = {}
        self.setTabsClosable(False)
        self.tabCloseRequested.connect(self.closeTab)
        self.tabBar().tabMoved.connect(self._handle_tab_moved)

    def closeTab(self, index):
        """Close the tab at ``index`` unless it is the home tab.

        Calls ``controller.model.cleanup()`` on the page widget when such an
        attribute chain exists, removes the tab, emits ``tab_closed`` and
        schedules the page widget for deletion.

        Raises:
            Exception: any error raised while closing is logged and re-raised.
        """
        self.logger.debug(f"Attempting to close tab at index {index}")

        if not (self.count() > 1 and index != 0):
            self.logger.debug("Tab closure conditions not met")
            return

        widget = self.widget(index)
        if widget is None:
            self.logger.warning(f"No widget found at index {index}")
            return

        try:
            tab_text = self.tabText(index)

            # Cleanup controller if exists
            controller = getattr(widget, 'controller', None)
            model = getattr(controller, 'model', None)
            if model is not None and hasattr(model, 'cleanup'):
                model.cleanup()

            # Tracking keys are "<label>_<id(widget)>", not the bare tab text,
            # so entries are dropped by widget identity. Looking them up by
            # tab text never matched and left stale entries behind.
            self._forget_widget(widget)

            self.removeTab(index)
            self.tab_closed.emit(widget)
            widget.deleteLater()
            self._update_all_tabs()

            self.logger.debug(f"Successfully closed tab '{tab_text}'")
        except Exception as e:
            self.logger.error(f"Failed to close tab: {e}", exc_info=True)
            raise

    def addTab(self, widget, label):
        """Add ``widget`` as a new tab titled ``label``.

        Every tab except the one that lands on index 0 gets a close button.

        Returns:
            The index of the new tab, or -1 if ``widget``/``label`` is missing
            or adding the tab failed.
        """
        if widget is None or not label:
            self.logger.debug("addTab called without a widget or label")
            return -1

        try:
            index = super().addTab(widget, label)

            close_button = None
            if index != 0:
                close_button = self._create_close_button(index, label)
                self.tabBar().setTabButton(index, QTabBar.ButtonPosition.RightSide, close_button)

            # Track the tab only once it really exists in the widget.
            self._tabs[f"{label}_{id(widget)}"] = {
                'widget': widget,
                'label': label,
                'close_button': close_button
            }
            return index
        except Exception as e:
            self.logger.error(f"Error adding tab: {e}", exc_info=True)
            return -1

    def _forget_widget(self, widget):
        """Remove every tracking entry that refers to ``widget``."""
        stale_keys = [key for key, entry in self._tabs.items() if entry.get('widget') is widget]
        for key in stale_keys:
            del self._tabs[key]

    def _close_tab_for(self, widget):
        """Close the tab that currently holds ``widget``, wherever it is now."""
        index = self.indexOf(widget) if widget is not None else -1
        self.safely_close_tab(index)

    def _create_close_button(self, index, label):
        """Create the close button for the tab currently at ``index``.

        The button remembers the page widget rather than the index, and looks
        the index up when clicked, so it stays correct after other tabs are
        closed, inserted or moved.
        """
        widget = self.widget(index)

        close_button = QToolButton(self.tabBar())
        close_button.setObjectName(f"close_button_{label}")
        close_icon = self.style().standardIcon(QtWidgets.QStyle.StandardPixmap.SP_TitleBarCloseButton)
        close_button.setIcon(close_icon)
        close_button.setIconSize(QSize(16, 16))
        close_button.setAutoRaise(True)
        close_button.setStyleSheet("""
            QToolButton {
                border: none;
                padding: 0px;
            }
            QToolButton:hover {
                background: #c42b1c;
            }
        """)
        close_button.setCursor(QCursor(Qt.CursorShape.PointingHandCursor))
        close_button.setFixedSize(18, 18)
        close_button.clicked.connect(lambda checked=False, page=widget: self._close_tab_for(page))
        return close_button

    def safely_close_tab(self, index):
        """Close the tab at ``index`` if it is valid and not the home tab; log any error."""
        try:
            if 0 < index < self.count() and self.widget(index) is not None:
                self.closeTab(index)
        except Exception as e:
            self.logger.error(f"Error in safely_close_tab: {e}")

    def _handle_tab_moved(self, from_index: int, to_index: int):
        """Slot for the tab bar's ``tabMoved`` signal: re-check close buttons."""
        try:
            self._update_all_tabs()
        except Exception as e:
            self.logger.error(f"Error handling tab movement: {e}")

    def _update_all_tabs(self):
        """Make sure every tab except the home tab has a tracked close button.

        Existing buttons need no rebinding because they resolve their tab
        index at click time.
        """
        try:
            tab_bar = self.tabBar()
            for i in range(1, self.count()):  # Skip index 0 (home tab)
                widget = self.widget(i)
                if widget is None:
                    continue

                already_tracked = any(
                    entry.get('widget') is widget and entry.get('close_button') is not None
                    for entry in self._tabs.values()
                )
                if already_tracked:
                    continue

                label = self.tabText(i)
                close_button = self._create_close_button(i, label)
                # Drop entries left over from an earlier label of this widget.
                self._forget_widget(widget)
                self._tabs[f"{label}_{id(widget)}"] = {
                    'widget': widget,
                    'label': label,
                    'close_button': close_button
                }
                tab_bar.setTabButton(i, QTabBar.ButtonPosition.RightSide, close_button)
        except Exception as e:
            self.logger.error(f"Error updating tabs: {e}")

    def moveTab(self, from_index, to_index):
        """Move a tab, refusing to move the home tab or to displace it.

        QTabWidget has no ``moveTab`` of its own; the move is performed on the
        tab bar. A successful move emits ``tabMoved``, which runs
        ``_handle_tab_moved``.
        """
        try:
            tab_count = self.count()
            if (0 < from_index < tab_count and
                    0 < to_index < tab_count):
                self.tabBar().moveTab(from_index, to_index)
        except Exception as e:
            self.logger.error(f"Error moving tab: {e}")
src/views/FindTargetsView.py CHANGED
@@ -1,95 +1,148 @@
1
  from PyQt6 import QtWidgets
2
- from PyQt6.QtWidgets import QWidget, QVBoxLayout, QTableWidget, QTableWidgetItem, QPushButton, QHBoxLayout, QLabel
 
3
  from PyQt6 import uic
4
  from PyQt6.QtCore import Qt, QTimer
 
5
 
6
  class FindTargetsView(QtWidgets.QMainWindow):
7
  def __init__(self, global_settings):
8
  super().__init__()
9
  self.global_settings = global_settings
10
  self._init_ui()
 
 
 
11
 
12
  def _init_ui(self):
13
  uic.loadUi(self.global_settings.get_ui_dir_path() + '/find_targets.ui', self)
14
  self.results_table = self.findChild(QTableWidget, 'tblTargets')
 
 
15
  self.results_table.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows)
 
 
16
 
17
- # Optimize table performance
18
- self.results_table.setUpdatesEnabled(False) # Disable updates during setup
19
- self.results_table.setSortingEnabled(False) # Disable sorting during setup
20
- self.results_table.horizontalHeader().setStretchLastSection(True)
21
 
22
- # Set up the table columns
23
- self.results_table.setColumnCount(7)
24
- self.results_table.setHorizontalHeaderLabels([
 
 
 
 
25
  "Feature Type", "Chromosome/Scaffold #", "Feature ID/Locus Tag",
26
- "Feature Name", "Feature Description", "Location", "Strand"
27
- ])
28
-
29
- self.push_button_view_targets = self.findChild(QPushButton, 'pbtnViewTargets')
 
 
 
 
 
 
30
 
31
- # Pre-allocate items for better performance
32
- self._cached_items = {}
 
 
33
 
34
- def _get_table_item(self, text):
35
- """Cache and reuse QTableWidgetItems for better performance"""
36
- if text not in self._cached_items:
37
- item = QTableWidgetItem(str(text))
38
- item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable) # Make item read-only
39
- self._cached_items[text] = item
40
- return self._cached_items[text].clone()
 
 
 
 
 
 
 
 
41
 
42
  def display_results(self, results):
43
- # Disable updates for bulk operations
 
 
 
 
 
 
44
  self.results_table.setUpdatesEnabled(False)
45
  self.results_table.setSortingEnabled(False)
 
46
 
47
- # Set row count once
48
- self.results_table.setRowCount(len(results))
49
-
50
- # Batch insert items
51
- for row, result in enumerate(results):
52
- self.results_table.setItem(row, 0, self._get_table_item(result['feature_type']))
53
- self.results_table.setItem(row, 1, self._get_table_item(str(result['chromosome'])))
54
- self.results_table.setItem(row, 2, self._get_table_item(result['feature_id']))
55
- self.results_table.setItem(row, 3, self._get_table_item(result['feature_name']))
56
- self.results_table.setItem(row, 4, self._get_table_item(result['feature_description']))
57
- self.results_table.setItem(row, 5, self._get_table_item(result['location']))
58
- self.results_table.setItem(row, 6, self._get_table_item(result['strand']))
59
-
60
- # Re-enable updates and adjust columns
61
- QTimer.singleShot(0, self._finish_table_update)
62
-
63
- def _finish_table_update(self):
64
- """Complete table update in the next event loop iteration"""
65
- self.results_table.resizeColumnsToContents()
66
  self.results_table.setUpdatesEnabled(True)
67
  self.results_table.setSortingEnabled(True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def get_selected_targets(self):
70
  selected_rows = set(index.row() for index in self.results_table.selectedIndexes())
71
  selected_targets = []
72
 
73
- # Get column indices once
74
- columns = {
75
- 'feature_type': 0,
76
- 'chromosome': 1,
77
- 'feature_id': 2,
78
- 'feature_name': 3,
79
- 'feature_description': 4,
80
- 'location': 5,
81
- 'strand': 6
82
- }
83
-
84
  for row in selected_rows:
85
- target = {
86
- 'feature_type': self.results_table.item(row, columns['feature_type']).text(),
87
- 'chromosome': self.results_table.item(row, columns['chromosome']).text(),
88
- 'feature_id': self.results_table.item(row, columns['feature_id']).text(),
89
- 'feature_name': self.results_table.item(row, columns['feature_name']).text(),
90
- 'feature_description': self.results_table.item(row, columns['feature_description']).text(),
91
- 'location': self.results_table.item(row, columns['location']).text(),
92
- 'strand': self.results_table.item(row, columns['strand']).text()
93
- }
94
- selected_targets.append(target)
95
  return selected_targets
 
 
 
 
 
 
 
 
 
 
1
  from PyQt6 import QtWidgets
2
+ from PyQt6.QtWidgets import (QWidget, QVBoxLayout, QTableWidget, QTableWidgetItem,
3
+ QPushButton, QHBoxLayout, QLabel, QAbstractItemView)
4
  from PyQt6 import uic
5
  from PyQt6.QtCore import Qt, QTimer
6
+ import time
7
 
8
class FindTargetsView(QtWidgets.QMainWindow):
    """Window that lists search results in a table and returns the selected ones.

    Results are dictionaries with the keys ``feature_type``, ``chromosome``,
    ``feature_id``, ``feature_name`` and ``feature_description``. Rows are
    filled in batches: the first batch synchronously, the rest from the event
    loop and on scroll. Sorting is only switched on once every row is filled,
    because sorting a partially filled QTableWidget reorders rows while they
    are still being inserted.
    """

    def __init__(self, global_settings):
        super().__init__()
        self.global_settings = global_settings
        # This state must exist before _init_ui() connects the scroll handler
        # that reads it.
        self.batch_size = 100  # Number of rows to load at once
        self._all_results = []  # Store all results
        self._loaded_rows = 0  # Rows [0, _loaded_rows) already have items
        self._init_ui()

    def _init_ui(self):
        """Load the .ui file and configure the results table."""
        uic.loadUi(self.global_settings.get_ui_dir_path() + '/find_targets.ui', self)
        self.results_table = self.findChild(QTableWidget, 'tblTargets')

        # Table settings for large datasets
        self.results_table.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows)
        self.results_table.setShowGrid(False)
        self.results_table.setAlternatingRowColors(True)

        # Scroll by pixel rather than by item
        self.results_table.setVerticalScrollMode(QTableWidget.ScrollMode.ScrollPerPixel)
        self.results_table.setHorizontalScrollMode(QTableWidget.ScrollMode.ScrollPerPixel)

        self.results_table.setVerticalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAlwaysOn)
        self.results_table.viewport().setProperty("cursor", Qt.CursorShape.ArrowCursor)

        self.results_table.setColumnCount(5)  # Reduced from 7 to 5 columns
        headers = [
            "Feature Type", "Chromosome/Scaffold #", "Feature ID/Locus Tag",
            "Feature Name", "Feature Description"
        ]
        self.results_table.setHorizontalHeaderLabels(headers)

        column_widths = [100, 150, 150, 150, 300]
        for i, width in enumerate(column_widths):
            self.results_table.setColumnWidth(i, width)

        self.results_table.horizontalHeader().setStretchLastSection(True)

        # Load rows ahead of the viewport as the user scrolls
        self.results_table.verticalScrollBar().valueChanged.connect(self._handle_scroll)

        self.push_button_view_targets = self.findChild(QPushButton, 'pbtnViewTargets')

    def _create_table_item(self, text):
        """Return a read-only table item showing ``str(text)``."""
        item = QTableWidgetItem(str(text))
        item.setFlags(item.flags() & ~Qt.ItemFlag.ItemIsEditable)
        return item

    def _create_row_items(self, result):
        """Return the five column items for one result dictionary."""
        return [
            self._create_table_item(result['feature_type']),
            self._create_table_item(str(result['chromosome'])),
            self._create_table_item(result['feature_id']),
            self._create_table_item(result['feature_name']),
            self._create_table_item(result['feature_description'])
        ]

    def display_results(self, results):
        """Replace the table contents with ``results``.

        Only the first ``batch_size`` rows are filled before this returns; the
        remaining rows are filled from the event loop.
        """
        start_time = time.perf_counter()

        # Keep our own list so later changes to the caller's list cannot
        # desynchronise rows and results.
        self._all_results = list(results) if results else []
        self._loaded_rows = 0
        total_rows = len(self._all_results)

        table = self.results_table
        table.setUpdatesEnabled(False)
        table.setSortingEnabled(False)
        table.setVisible(False)
        try:
            # Remove items left over from a previous, longer result set.
            table.clearContents()
            table.setRowCount(total_rows)
            self._load_batch(0, min(self.batch_size, total_rows))
        finally:
            # Always restore the table, even if populating it failed.
            table.setVisible(True)
            table.setUpdatesEnabled(True)

        self._schedule_background_load()

        total_time = time.perf_counter() - start_time
        self.global_settings.logger.debug(f"Initial display time: {total_time:.2f} seconds")

    def _schedule_background_load(self):
        """Queue the next batch on the event loop while rows remain unfilled."""
        if self._loaded_rows < len(self._all_results):
            QTimer.singleShot(0, self._load_next_batch)

    def _load_next_batch(self):
        """Fill the next ``batch_size`` rows and queue another batch if needed."""
        step = max(1, self.batch_size)  # guard against a zero batch size looping forever
        self._load_batch(self._loaded_rows, self._loaded_rows + step)
        self._schedule_background_load()

    def _load_batch(self, start_idx, end_idx):
        """Create items for rows ``[start_idx, end_idx)`` of the stored results.

        The column-0 item of every row stores the row's index into
        ``_all_results`` under ``Qt.ItemDataRole.UserRole`` so the result can
        still be found after the table has been sorted. Sorting is enabled as
        soon as all rows are filled.
        """
        total = len(self._all_results)
        end_idx = min(end_idx, total)
        table = self.results_table

        if start_idx < end_idx:
            for row in range(start_idx, end_idx):
                items = self._create_row_items(self._all_results[row])
                items[0].setData(Qt.ItemDataRole.UserRole, row)
                for col, item in enumerate(items):
                    table.setItem(row, col, item)
            self._loaded_rows = max(self._loaded_rows, end_idx)

        if self._loaded_rows >= total and not table.isSortingEnabled():
            table.setSortingEnabled(True)

    def _handle_scroll(self, value):
        """Fill rows up to one viewport beyond the last visible row.

        ``value`` is not used as a row number: the table scrolls per pixel, so
        the visible rows are looked up with ``rowAt`` instead.
        """
        total = len(self._all_results)
        if self._loaded_rows >= total:
            return

        table = self.results_table
        first_visible = table.rowAt(0)
        if first_visible < 0:
            return

        last_visible = table.rowAt(max(0, table.viewport().height() - 1))
        if last_visible < 0:
            # The viewport extends past the last row.
            last_visible = table.rowCount() - 1

        visible_rows = last_visible - first_visible + 1
        end_row = min(total, last_visible + 1 + visible_rows)

        # Load more rows if needed
        if end_row > self._loaded_rows:
            self._load_batch(self._loaded_rows, end_row)

    def get_selected_targets(self):
        """Return the result dictionaries for the selected rows, top to bottom."""
        table = self.results_table
        selected_rows = sorted({index.row() for index in table.selectedIndexes()})
        selected_targets = []

        for row in selected_rows:
            item = table.item(row, 0)
            source_row = item.data(Qt.ItemDataRole.UserRole) if item is not None else None
            if source_row is None:
                # Rows without an item have not been filled yet; sorting is
                # still off then, so the visual row is the result index.
                source_row = row
            if 0 <= source_row < len(self._all_results):
                selected_targets.append(self._all_results[source_row])

        return selected_targets

    def clear_results(self):
        """Clear all results from the table"""
        self.results_table.setUpdatesEnabled(False)
        try:
            self.results_table.clearContents()
            self.results_table.setRowCount(0)
            self._all_results = []
            self._loaded_rows = 0
        finally:
            self.results_table.setUpdatesEnabled(True)
src/views/HomeWindowView.py CHANGED
@@ -7,6 +7,7 @@ class HomeWindowView(QWidget):
7
  def __init__(self, global_settings):
8
  super().__init__()
9
  self.global_settings = global_settings
 
10
  self._init_ui()
11
 
12
  def _init_ui(self) -> None:
@@ -96,9 +97,9 @@ class HomeWindowView(QWidget):
96
  self.combo_box_organism.clear()
97
  self.combo_box_organism.addItems(organisms)
98
 
99
- def update_combo_box_annotation_files(self, annotation_files: list) -> None:
100
- self.combo_box_local_annotation_files.clear()
101
- self.combo_box_local_annotation_files.addItems(annotation_files)
102
 
103
  def set_progress_bar(self, value: int) -> None:
104
  self.progress_bar_find_targets.setValue(value)
@@ -126,4 +127,24 @@ class HomeWindowView(QWidget):
126
  return "feature" # Default to feature if somehow none are selected
127
 
128
  def get_annotation_file(self) -> str:
129
- return self.combo_box_local_annotation_files.currentText()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def __init__(self, global_settings):
8
  super().__init__()
9
  self.global_settings = global_settings
10
+ self.logger = self.global_settings.logger
11
  self._init_ui()
12
 
13
  def _init_ui(self) -> None:
 
97
  self.combo_box_organism.clear()
98
  self.combo_box_organism.addItems(organisms)
99
 
100
+ # def update_combo_box_annotation_files(self, annotation_files: list) -> None:
101
+ # self.combo_box_local_annotation_files.clear()
102
+ # self.combo_box_local_annotation_files.addItems(annotation_files)
103
 
104
  def set_progress_bar(self, value: int) -> None:
105
  self.progress_bar_find_targets.setValue(value)
 
127
  return "feature" # Default to feature if somehow none are selected
128
 
129
  def get_annotation_file(self) -> str:
130
+ return self.combo_box_local_annotation_files.currentText()
131
+
132
def update_combo_box_annotation_files(self, files):
    """Repopulate the local annotation combo box from ``files``.

    Names ending in ``.index`` are left out. When at least one name remains,
    the first one becomes the current selection. Any exception is logged
    through ``self.logger`` and not propagated.
    """
    try:
        combo = self.combo_box_local_annotation_files
        combo.clear()

        # Keep everything that is not an .index file
        visible_names = list(filter(lambda name: not name.endswith('.index'), files))

        if not visible_names:
            self.logger.debug("No local annotation files found")
            return

        combo.addItems(visible_names)
        combo.setCurrentIndex(0)
        self.logger.debug(f"Added {len(visible_names)} local annotation files to combo box")

    except Exception as e:
        self.logger.error(f"Error updating local annotation files: {str(e)}")
src/views/MainWindowView.py CHANGED
@@ -1,238 +1,68 @@
1
  from PyQt6.QtWidgets import (
2
- QMainWindow, QPushButton, QRadioButton, QComboBox, QPlainTextEdit,
3
- QProgressBar, QMenuBar, QMenu, QStackedWidget, QWidget, QVBoxLayout,
4
- QHBoxLayout, QLabel, QFrame, QTabWidget, QToolButton, QTabBar
5
  )
6
- from PyQt6.QtGui import QIcon, QAction, QFont, QCursor
7
- from PyQt6.QtCore import Qt, QPoint, pyqtSignal, QSize
8
  from PyQt6 import uic, QtWidgets, QtCore, QtGui
9
- from utils.ui import scale_ui, show_error
 
10
  import os
11
  from typing import Optional
12
- from functools import partial
13
  import qdarktheme
 
14
 
15
- class CloseableTabWidget(QTabWidget):
16
- tab_closed = pyqtSignal(QWidget)
17
-
18
- def __init__(self, parent=None):
19
- super().__init__(parent)
20
- self.setTabsClosable(False)
21
- self.tabCloseRequested.connect(self.closeTab)
22
- self._tabs = {} # Dictionary to keep track of tab widgets
23
- self.tabBar().tabMoved.connect(self._handle_tab_moved)
24
-
25
- def closeTab(self, index):
26
- try:
27
- if self.count() > 1 and index != 0:
28
- widget = self.widget(index)
29
- if widget:
30
- # Get tab text before removal
31
- tab_text = self.tabText(index)
32
-
33
- # Clean up the controller if it exists
34
- controller = getattr(widget, 'controller', None)
35
- if controller and hasattr(controller, 'model') and hasattr(controller.model, 'cleanup'):
36
- controller.model.cleanup()
37
-
38
- # Remove from tracking dictionary
39
- if tab_text in self._tabs:
40
- del self._tabs[tab_text]
41
-
42
- # Remove the tab
43
- self.removeTab(index)
44
-
45
- # Emit signal before deletion
46
- self.tab_closed.emit(widget)
47
-
48
- # Schedule widget for deletion
49
- widget.deleteLater()
50
-
51
- # Update all remaining tabs
52
- self._update_all_tabs()
53
- except Exception as e:
54
- print(f"Error closing tab: {e}")
55
-
56
- def addTab(self, widget, label):
57
- try:
58
- if widget and label:
59
- # Store widget reference with unique identifier
60
- tab_id = f"{label}_{id(widget)}"
61
- self._tabs[tab_id] = {
62
- 'widget': widget,
63
- 'label': label,
64
- 'close_button': None
65
- }
66
-
67
- # Add the tab
68
- index = super().addTab(widget, label)
69
-
70
- if index != 0:
71
- # Create and setup close button
72
- close_button = self._create_close_button(index, label)
73
- self._tabs[tab_id]['close_button'] = close_button
74
- self.tabBar().setTabButton(index, QTabBar.ButtonPosition.RightSide, close_button)
75
-
76
- return index
77
- except Exception as e:
78
- print(f"Error adding tab: {e}")
79
- return -1
80
-
81
- def _create_close_button(self, index, label):
82
- """Create a new close button for a tab"""
83
- close_button = QToolButton(self.tabBar())
84
- close_button.setObjectName(f"close_button_{label}")
85
- close_icon = self.style().standardIcon(QtWidgets.QStyle.StandardPixmap.SP_TitleBarCloseButton)
86
- close_button.setIcon(close_icon)
87
- close_button.setIconSize(QSize(16, 16))
88
- close_button.setAutoRaise(True)
89
- close_button.setStyleSheet("""
90
- QToolButton {
91
- border: none;
92
- padding: 0px;
93
- }
94
- QToolButton:hover {
95
- background: #c42b1c;
96
- }
97
- """)
98
- close_button.setCursor(QCursor(Qt.CursorShape.PointingHandCursor))
99
- close_button.setFixedSize(18, 18)
100
- close_button.clicked.connect(lambda checked, idx=index: self.safely_close_tab(idx))
101
- return close_button
102
-
103
- def safely_close_tab(self, index):
104
- """Safely handle tab closure with error checking"""
105
- try:
106
- if 0 <= index < self.count():
107
- current_widget = self.widget(index)
108
- if current_widget and index != 0:
109
- self.closeTab(index)
110
- except Exception as e:
111
- print(f"Error in safely_close_tab: {e}")
112
-
113
- def _handle_tab_moved(self, from_index: int, to_index: int):
114
- """Handle tab movement and update close buttons"""
115
- try:
116
- self._update_all_tabs()
117
- except Exception as e:
118
- print(f"Error handling tab movement: {e}")
119
-
120
- def _update_all_tabs(self):
121
- """Update all tabs and their close buttons"""
122
- try:
123
- for i in range(1, self.count()): # Skip index 0 (home tab)
124
- widget = self.widget(i)
125
- if widget:
126
- label = self.tabText(i)
127
- tab_id = f"{label}_{id(widget)}"
128
-
129
- # Create new close button if needed
130
- if tab_id not in self._tabs or not self._tabs[tab_id].get('close_button'):
131
- close_button = self._create_close_button(i, label)
132
- self._tabs[tab_id] = {
133
- 'widget': widget,
134
- 'label': label,
135
- 'close_button': close_button
136
- }
137
- self.tabBar().setTabButton(i, QTabBar.ButtonPosition.RightSide, close_button)
138
- else:
139
- # Update existing close button's click connection
140
- close_button = self._tabs[tab_id]['close_button']
141
- close_button.clicked.disconnect()
142
- close_button.clicked.connect(lambda checked, idx=i: self.safely_close_tab(idx))
143
- except Exception as e:
144
- print(f"Error updating tabs: {e}")
145
-
146
- def moveTab(self, from_index, to_index):
147
- """Override moveTab to safely handle tab movement"""
148
- try:
149
- if (0 <= from_index < self.count() and
150
- 0 <= to_index < self.count() and
151
- from_index != 0 and
152
- to_index != 0):
153
-
154
- super().moveTab(from_index, to_index)
155
- self._update_all_tabs()
156
-
157
- except Exception as e:
158
- print(f"Error moving tab: {e}")
159
-
160
-
161
- class MainWindowView(QMainWindow):
162
  def __init__(self, global_settings):
163
- super().__init__()
164
- self.global_settings = global_settings
165
- self.logger = global_settings.get_logger()
166
  self._init_ui()
167
  self.oldPos = None
168
 
169
  def _init_ui(self) -> None:
170
- # Hide the window and disable updates during initialization
171
- self.hide()
172
- self.setUpdatesEnabled(False)
173
- try:
174
- # Calculate center position first
175
- screen = QtGui.QGuiApplication.primaryScreen()
176
- screen_geometry = screen.geometry()
177
- centerPoint = screen_geometry.center()
178
-
179
- # Load and initialize UI
180
- self._load_ui_file()
181
- self._init_window_properties()
182
- self._init_ui_elements()
183
- self.apply_theme()
184
- self._scale_ui()
185
-
186
- # Get final size
187
- final_size = self.size()
188
-
189
- # Calculate position only once
190
- x = centerPoint.x() - (final_size.width() // 2)
191
- y = centerPoint.y() - (final_size.height() // 2)
192
-
193
- # Set position and size in a single operation
194
- self.setGeometry(x, y, final_size.width(), final_size.height())
195
-
196
- # Re-enable updates and show window
197
- self.setUpdatesEnabled(True)
198
- self.show()
199
- self.repaint() # Force immediate repaint
200
-
201
- self.logger.debug(f"Window initialized at position ({x}, {y}) with size {final_size}")
202
- except Exception as e:
203
- self._handle_init_error(e)
204
-
205
- def _load_ui_file(self) -> None:
206
- ui_file = os.path.join(self.global_settings.get_ui_dir_path(), "main_window.ui")
207
- uic.loadUi(ui_file, self)
208
 
209
  def _init_window_properties(self) -> None:
210
- """
211
- Creates a frameless, translucent window without a toolbar.
212
- """
213
- # Set window flags before other properties
214
  self.setWindowFlags(Qt.WindowType.FramelessWindowHint)
215
  self.setAttribute(Qt.WidgetAttribute.WA_TranslucentBackground)
216
  self.setAttribute(Qt.WidgetAttribute.WA_NoSystemBackground)
217
- # Hide toolbars
218
  toolbars = self.findChildren(QtWidgets.QToolBar)
219
  for toolbar in toolbars:
220
  toolbar.hide()
221
- # Ensure window starts hidden
222
- self.setVisible(False)
223
 
224
  def _init_ui_elements(self) -> None:
225
- # Initialize menu bar and custom title bar
226
  self._init_menuBar()
227
  self._init_custom_title_bar()
228
 
229
- # Create main widget and layout
230
  main_widget = QWidget()
231
  main_layout = QVBoxLayout(main_widget)
232
  main_layout.setContentsMargins(0, 0, 0, 0)
233
  main_layout.setSpacing(0)
234
 
235
- # Add title bar and divider
236
  main_layout.addWidget(self.title_bar, 0)
237
  main_layout.addWidget(self._init_divider(), 0)
238
 
@@ -242,16 +72,14 @@ class MainWindowView(QMainWindow):
242
  tab_container_layout.setContentsMargins(0, 0, 0, 0)
243
  tab_container_layout.setSpacing(0)
244
 
245
- # Use the _add_new_tab method to add a new tab
246
- # self._add_new_tab()
247
-
248
  # Initialize and add CloseableTabWidget
249
  self.tab_widget = CloseableTabWidget(self)
250
- self.tab_widget.setSizePolicy(QtWidgets.QSizePolicy.Policy.Expanding, QtWidgets.QSizePolicy.Policy.Expanding)
 
251
  self.tab_widget.setStyleSheet("""
252
  QTabWidget::pane {
253
  border: 1px solid #444444;
254
- padding: 10px; /* Add padding here */
255
  }
256
  """)
257
  tab_container_layout.addWidget(self.tab_widget)
@@ -271,40 +99,35 @@ class MainWindowView(QMainWindow):
271
  self.action_open_NCBI = self._find_widget("actGoToNCBI", QAction)
272
 
273
  def _find_widget(self, name: str, widget_type: type) -> Optional[QtWidgets.QWidget]:
 
274
  widget = self.findChild(widget_type, name)
275
  if widget is None:
276
- self.global_settings.logger.warning(f"Widget '{name}' not found in UI file.")
277
  return widget
278
-
279
  def _init_custom_title_bar(self) -> None:
280
  self.title_bar = QWidget(self)
281
  self.title_bar.setObjectName("custom_title_bar")
282
- self.title_bar.setFixedHeight(32) # Reduced height
283
 
284
  # Create the main horizontal layout for the title bar
285
  layout = QHBoxLayout(self.title_bar)
286
- layout.setContentsMargins(10, 0, 10, 0) # Equal margins on left and right
287
- layout.setSpacing(5) # Reduced spacing between items
288
 
289
  # ----- Window Control Buttons -----
290
- button_font = QFont("Arial", 8)
291
-
292
  self.minimize_window_button = QPushButton("-", self.title_bar)
293
  self.minimize_window_button.setObjectName("minimize_window_button")
294
  self.minimize_window_button.setFixedSize(20, 20)
295
- self.minimize_window_button.setFont(button_font)
296
 
297
  self.maximize_window_button = QPushButton("⛶", self.title_bar)
298
  self.maximize_window_button.setObjectName("maximize_window_button")
299
  self.maximize_window_button.setFixedSize(20, 20)
300
- self.maximize_window_button.setFont(button_font)
301
 
302
  self.close_window_button = QPushButton("✕", self.title_bar)
303
  self.close_window_button.setObjectName("close_window_button")
304
  self.close_window_button.setFixedSize(20, 20)
305
- self.close_window_button.setFont(button_font)
306
 
307
- # Apply a style to center the text vertically and horizontally
308
  button_style = """
309
  QPushButton {
310
  padding: 0px;
@@ -343,7 +166,6 @@ class MainWindowView(QMainWindow):
343
  right_layout.addStretch()
344
  right_layout.addWidget(self.theme_toggle_button)
345
 
346
- # ----- Synchronize Widths of Left and Right Widgets -----
347
  # Adjust left_widget to calculate its required width
348
  left_widget.adjustSize()
349
  left_width = left_widget.sizeHint().width()
@@ -351,10 +173,8 @@ class MainWindowView(QMainWindow):
351
  # Set right_widget's fixed width to match left_widget's width
352
  right_widget.setFixedWidth(left_width)
353
 
354
- # ----- Title Label -----
355
  self.title_label = QLabel("CASPER", self.title_bar)
356
  self.title_label.setObjectName("title_label")
357
- self.title_label.setFont(QFont("Arial", 10, QFont.Weight.Bold))
358
  self.title_label.setAlignment(Qt.AlignmentFlag.AlignCenter) # Center the text in the label
359
 
360
  # Add Widgets to the Main Title Bar Layout
@@ -364,6 +184,11 @@ class MainWindowView(QMainWindow):
364
  layout.addStretch(1)
365
  layout.addWidget(right_widget)
366
 
 
 
 
 
 
367
 
368
  def _init_divider(self):
369
  divider = QFrame()
@@ -372,85 +197,26 @@ class MainWindowView(QMainWindow):
372
  divider.setFrameShadow(QFrame.Shadow.Sunken)
373
  return divider
374
 
375
- # def _add_new_tab(self):
376
- # new_tab = QWidget()
377
- # layout = QVBoxLayout(new_tab)
378
-
379
- # label = QLabel("This is a new tab", new_tab)
380
- # layout.addWidget(label)
381
-
382
- # new_tab_button = QPushButton("Open New Tab", new_tab)
383
- # new_tab_button.clicked.connect(self._add_new_tab)
384
- # layout.addWidget(new_tab_button)
385
- # tab_index = self.tab_widget.addTab(new_tab, f"Tab {self.tab_widget.count() + 1}")
386
- # self.tab_widget.setCurrentIndex(tab_index)
387
-
388
-
389
- def _scale_ui(self):
390
- """Modified scale_ui to only handle sizing, not positioning"""
391
- try:
392
- screen = QtGui.QGuiApplication.primaryScreen()
393
- screen_geometry = screen.geometry()
394
- width = screen_geometry.width()
395
- height = screen_geometry.height()
396
-
397
- # Font scaling
398
- self.centralWidget().setStyleSheet(f"font: 12pt 'Arial';")
399
-
400
- if hasattr(self, 'title'):
401
- scaled_title_font_size = int(30 * (width / 1920))
402
- self.title.setStyleSheet(f"font: bold {scaled_title_font_size}pt 'Arial';")
403
-
404
- # Calculate size only
405
- scaledWidth = int((width * 575) / 1920)
406
- scaledHeight = int((height * 400) / 1080)
407
-
408
- # Ensure minimum size
409
- self.adjustSize()
410
- currentWidth = self.size().width()
411
- currentHeight = self.size().height()
412
-
413
- if scaledHeight < currentHeight:
414
- scaledHeight = currentHeight
415
- if scaledWidth < currentWidth:
416
- scaledWidth = currentWidth
417
-
418
- # Only resize, don't reposition
419
- self.resize(scaledWidth, scaledHeight)
420
-
421
- except Exception as e:
422
- self.logger.error(f"Error in _scale_ui: {str(e)}")
423
-
424
  def _handle_init_error(self, e: Exception) -> None:
425
  error_msg = f"Error initializing MainWindowView: {str(e)}"
426
- self.global_settings.logger.error(error_msg, exc_info=True)
427
- show_error(self.global_settings, "Initialization Error", error_msg)
428
  raise
429
 
430
  def update_theme_icon(self) -> None:
431
- icon_name = "dark_mode.png" if self.global_settings.get_theme() == "dark" else "light_mode.png"
432
- icon_path = os.path.join(self.global_settings.get_assets_dir_path(), icon_name)
433
- icon = QIcon(icon_path)
434
- self.theme_toggle_button.setIcon(icon)
435
- self.theme_toggle_button.setIconSize(QtCore.QSize(16, 16))
436
-
437
- def mousePressEvent(self, event):
438
- if event.button() == Qt.MouseButton.LeftButton:
439
- self.oldPos = event.globalPosition().toPoint()
440
-
441
- def mouseMoveEvent(self, event):
442
- if self.oldPos:
443
- delta = event.globalPosition().toPoint() - self.oldPos
444
- self.move(self.x() + delta.x(), self.y() + delta.y())
445
- self.oldPos = event.globalPosition().toPoint()
446
-
447
- def mouseReleaseEvent(self, event):
448
- if event.button() == Qt.MouseButton.LeftButton:
449
- self.oldPos = None
450
 
451
  def resizeEvent(self, event):
452
  super().resizeEvent(event)
453
- self.logger.debug(f"Window resized. New size: {self.size()}")
454
 
455
  def apply_theme(self):
456
  themes = {
@@ -488,11 +254,8 @@ class MainWindowView(QMainWindow):
488
  }
489
  }
490
 
491
- # Get the current theme
492
- current_theme = self.global_settings.get_theme()
493
  theme = themes["dark"] if current_theme == "dark" else themes["light"]
494
-
495
- # Apply the selected theme using qdarktheme
496
  qdarktheme.setup_theme(current_theme)
497
 
498
  # Set the stylesheet
@@ -542,13 +305,19 @@ class MainWindowView(QMainWindow):
542
  }}
543
  """)
544
 
545
- def show_window(self) -> None:
546
- """Shows the window without repositioning"""
547
- self.show()
548
- self.repaint()
549
-
550
-
551
-
552
-
553
 
 
 
 
 
 
 
554
 
 
 
 
 
 
1
  from PyQt6.QtWidgets import (
2
+ QMainWindow, QPushButton, QWidget, QVBoxLayout,
3
+ QHBoxLayout, QLabel, QFrame,
 
4
  )
5
+ from PyQt6.QtGui import QIcon, QAction
6
+ from PyQt6.QtCore import Qt
7
  from PyQt6 import uic, QtWidgets, QtCore, QtGui
8
+ from utils.ui import show_error
9
+ from utils.LoggingMixin import LoggingMixin
10
  import os
11
  from typing import Optional
 
12
  import qdarktheme
13
+ from views.CloseableTabWidget import CloseableTabWidget
14
 
15
+ class MainWindowView(QMainWindow, LoggingMixin):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def __init__(self, global_settings):
17
+ QMainWindow.__init__(self)
18
+ LoggingMixin.__init__(self)
19
+ self.settings = global_settings
20
  self._init_ui()
21
  self.oldPos = None
22
 
23
  def _init_ui(self) -> None:
24
+ self.log_method_call("_init_ui")
25
+
26
+ screen = QtGui.QGuiApplication.primaryScreen()
27
+ screen_geometry = screen.geometry()
28
+ centerPoint = screen_geometry.center()
29
+
30
+ # Load UI file
31
+ uic.loadUi(self.settings.get_ui_dir_path() + "/main_window.ui", self)
32
+ self._init_window_properties()
33
+ self._init_ui_elements()
34
+ self.apply_theme()
35
+
36
+ # Calculate and set position
37
+ final_size = self.size()
38
+ x = centerPoint.x() - (final_size.width() // 2)
39
+ y = centerPoint.y() - (final_size.height() // 2)
40
+
41
+ self.setGeometry(x, y, final_size.width(), final_size.height())
42
+ self.setUpdatesEnabled(True)
43
+ self.show()
44
+ self.repaint()
45
+
46
+ self.log_debug(f"Window initialized at position ({x}, {y}) with size {final_size}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  def _init_window_properties(self) -> None:
 
 
 
 
49
  self.setWindowFlags(Qt.WindowType.FramelessWindowHint)
50
  self.setAttribute(Qt.WidgetAttribute.WA_TranslucentBackground)
51
  self.setAttribute(Qt.WidgetAttribute.WA_NoSystemBackground)
52
+
53
  toolbars = self.findChildren(QtWidgets.QToolBar)
54
  for toolbar in toolbars:
55
  toolbar.hide()
 
 
56
 
57
  def _init_ui_elements(self) -> None:
 
58
  self._init_menuBar()
59
  self._init_custom_title_bar()
60
 
 
61
  main_widget = QWidget()
62
  main_layout = QVBoxLayout(main_widget)
63
  main_layout.setContentsMargins(0, 0, 0, 0)
64
  main_layout.setSpacing(0)
65
 
 
66
  main_layout.addWidget(self.title_bar, 0)
67
  main_layout.addWidget(self._init_divider(), 0)
68
 
 
72
  tab_container_layout.setContentsMargins(0, 0, 0, 0)
73
  tab_container_layout.setSpacing(0)
74
 
 
 
 
75
  # Initialize and add CloseableTabWidget
76
  self.tab_widget = CloseableTabWidget(self)
77
+ self.tab_widget.setSizePolicy(QtWidgets.QSizePolicy.Policy.Expanding,
78
+ QtWidgets.QSizePolicy.Policy.Expanding)
79
  self.tab_widget.setStyleSheet("""
80
  QTabWidget::pane {
81
  border: 1px solid #444444;
82
+ padding: 10px;
83
  }
84
  """)
85
  tab_container_layout.addWidget(self.tab_widget)
 
99
  self.action_open_NCBI = self._find_widget("actGoToNCBI", QAction)
100
 
101
  def _find_widget(self, name: str, widget_type: type) -> Optional[QtWidgets.QWidget]:
102
+ """Find a widget by name and type"""
103
  widget = self.findChild(widget_type, name)
104
  if widget is None:
105
+ self.log_warning(f"Widget '{name}' not found in UI file")
106
  return widget
107
+
108
  def _init_custom_title_bar(self) -> None:
109
  self.title_bar = QWidget(self)
110
  self.title_bar.setObjectName("custom_title_bar")
111
+ self.title_bar.setFixedHeight(32)
112
 
113
  # Create the main horizontal layout for the title bar
114
  layout = QHBoxLayout(self.title_bar)
115
+ layout.setContentsMargins(10, 0, 10, 0)
116
+ layout.setSpacing(5)
117
 
118
  # ----- Window Control Buttons -----
 
 
119
  self.minimize_window_button = QPushButton("-", self.title_bar)
120
  self.minimize_window_button.setObjectName("minimize_window_button")
121
  self.minimize_window_button.setFixedSize(20, 20)
 
122
 
123
  self.maximize_window_button = QPushButton("⛶", self.title_bar)
124
  self.maximize_window_button.setObjectName("maximize_window_button")
125
  self.maximize_window_button.setFixedSize(20, 20)
 
126
 
127
  self.close_window_button = QPushButton("✕", self.title_bar)
128
  self.close_window_button.setObjectName("close_window_button")
129
  self.close_window_button.setFixedSize(20, 20)
 
130
 
 
131
  button_style = """
132
  QPushButton {
133
  padding: 0px;
 
166
  right_layout.addStretch()
167
  right_layout.addWidget(self.theme_toggle_button)
168
 
 
169
  # Adjust left_widget to calculate its required width
170
  left_widget.adjustSize()
171
  left_width = left_widget.sizeHint().width()
 
173
  # Set right_widget's fixed width to match left_widget's width
174
  right_widget.setFixedWidth(left_width)
175
 
 
176
  self.title_label = QLabel("CASPER", self.title_bar)
177
  self.title_label.setObjectName("title_label")
 
178
  self.title_label.setAlignment(Qt.AlignmentFlag.AlignCenter) # Center the text in the label
179
 
180
  # Add Widgets to the Main Title Bar Layout
 
184
  layout.addStretch(1)
185
  layout.addWidget(right_widget)
186
 
187
+ # Add mouse tracking to the title bar
188
+ self.title_bar.mousePressEvent = self.mousePressEvent
189
+ self.title_bar.mouseMoveEvent = self.mouseMoveEvent
190
+ self.title_bar.mouseReleaseEvent = self.mouseReleaseEvent
191
+ self.title_bar.setMouseTracking(True)
192
 
193
  def _init_divider(self):
194
  divider = QFrame()
 
197
  divider.setFrameShadow(QFrame.Shadow.Sunken)
198
  return divider
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  def _handle_init_error(self, e: Exception) -> None:
201
  error_msg = f"Error initializing MainWindowView: {str(e)}"
202
+ self.log_error("_init_ui", e)
203
+ show_error(self.settings, "Initialization Error", error_msg)
204
  raise
205
 
206
  def update_theme_icon(self) -> None:
207
+ try:
208
+ icon_name = "dark_mode.png" if self.settings.get_theme() == "dark" else "light_mode.png"
209
+ icon_path = os.path.join(self.settings.get_assets_dir_path(), icon_name)
210
+ icon = QIcon(icon_path)
211
+ self.theme_toggle_button.setIcon(icon)
212
+ self.theme_toggle_button.setIconSize(QtCore.QSize(16, 16))
213
+ except Exception as e:
214
+ self.log_error("update_theme_icon", e)
215
+ show_error(self.settings, "Theme Error", "Failed to update theme icon")
 
 
 
 
 
 
 
 
 
 
216
 
217
  def resizeEvent(self, event):
218
  super().resizeEvent(event)
219
+ self.log_debug(f"Window resized. New size: {self.size()}")
220
 
221
  def apply_theme(self):
222
  themes = {
 
254
  }
255
  }
256
 
257
+ current_theme = self.settings.get_theme()
 
258
  theme = themes["dark"] if current_theme == "dark" else themes["light"]
 
 
259
  qdarktheme.setup_theme(current_theme)
260
 
261
  # Set the stylesheet
 
305
  }}
306
  """)
307
 
308
+ def mousePressEvent(self, event):
309
+ """Handle mouse press events for window dragging"""
310
+ if event.button() == Qt.MouseButton.LeftButton:
311
+ self.oldPos = event.globalPosition().toPoint()
 
 
 
 
312
 
313
+ def mouseMoveEvent(self, event):
314
+ """Handle mouse move events for window dragging"""
315
+ if self.oldPos is not None:
316
+ delta = event.globalPosition().toPoint() - self.oldPos
317
+ self.move(self.x() + delta.x(), self.y() + delta.y())
318
+ self.oldPos = event.globalPosition().toPoint()
319
 
320
+ def mouseReleaseEvent(self, event):
321
+ """Handle mouse release events for window dragging"""
322
+ if event.button() == Qt.MouseButton.LeftButton:
323
+ self.oldPos = None
src/views/MultitargetingWindowView.py CHANGED
@@ -1,12 +1,11 @@
1
  from typing import Optional
2
  from PyQt6 import QtWidgets, uic, QtGui
3
  from PyQt6.QtWidgets import QTableWidgetItem, QAbstractItemView
4
- from PyQt6.QtGui import QIcon
5
  from PyQt6.QtCore import Qt
6
  from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg, NavigationToolbar2QT
7
  from matplotlib.figure import Figure
8
  from matplotlib.ticker import MaxNLocator
9
- from utils.ui import show_error, scale_ui
10
 
11
  class MultitargetingWindowView(QtWidgets.QMainWindow):
12
  def __init__(self, global_settings):
 
1
  from typing import Optional
2
  from PyQt6 import QtWidgets, uic, QtGui
3
  from PyQt6.QtWidgets import QTableWidgetItem, QAbstractItemView
 
4
  from PyQt6.QtCore import Qt
5
  from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg, NavigationToolbar2QT
6
  from matplotlib.figure import Figure
7
  from matplotlib.ticker import MaxNLocator
8
+ from utils.ui import show_error
9
 
10
  class MultitargetingWindowView(QtWidgets.QMainWindow):
11
  def __init__(self, global_settings):
src/views/NewEndonuclease.py DELETED
@@ -1,228 +0,0 @@
1
- import sys, os
2
- from PyQt5 import QtWidgets, uic, QtGui, QtCore, Qt
3
- import models.GlobalSettings as GlobalSettings
4
- from PyQt5.QtGui import QIntValidator
5
- import traceback
6
- import math
7
- from utils.ui import show_message, show_error, scale_ui, center_ui
8
-
9
- logger = GlobalSettings.logger
10
-
11
- class NewEndonuclease(QtWidgets.QMainWindow):
12
- def __init__(self):
13
- print("Initializing NewEndonuclease class")
14
- try:
15
- super(NewEndonuclease, self).__init__()
16
- uic.loadUi(GlobalSettings.appdir + 'ui/newendonuclease.ui', self)
17
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.ico"))
18
- self.setWindowTitle('New Endonuclease')
19
- self.error = False
20
- pamFlag = False
21
-
22
- self.onList = []
23
- self.offList = []
24
-
25
- self.onList, self.offList = self.get_on_off_data() ### Call function to fill on- and off- data name lists
26
-
27
- for name in self.onList: ### Add on-target names to drop-down
28
- self.comboBox.addItem(str(name))
29
-
30
- for name in self.offList: ### Add off-target names to drop-down
31
- self.comboBox_2.addItem(str(name))
32
-
33
- self.submit_button.clicked.connect(self.submit)
34
- self.cancel_button.clicked.connect(self.cancel)
35
-
36
- ### Set up validators for input fields:
37
- reg_ex1 = QtCore.QRegExp("[^/\\\\_]+") # No slashes or underscores
38
- reg_ex2 = QtCore.QRegExp("[^/\\\\_\\s]+") # No slashes, underscores, or spaces
39
- reg_ex3 = QtCore.QRegExp("[acdefghiklmnpqrstvwyACDEFGHIKLMNPQRSTVWY\S]+") # Only approved PAM characters and no spaces
40
- input_validator1 = QtGui.QRegExpValidator(reg_ex1, self)
41
- input_validator2 = QtGui.QRegExpValidator(reg_ex2, self)
42
- input_validator3 = QtGui.QRegExpValidator(reg_ex3, self)
43
- self.organism_name.setValidator(input_validator1)
44
- self.abbreviation.setValidator(input_validator2)
45
- self.pam_sequence.setValidator(input_validator3)
46
-
47
- self.seed_length.setValidator(QIntValidator(0,30,self.seed_length))
48
- self.five_length.setValidator(QIntValidator(0,20,self.five_length))
49
- self.three_length.setValidator(QIntValidator(0,20,self.three_length))
50
-
51
- groupbox_style = """
52
- QGroupBox:title{subcontrol-origin: margin;
53
- left: 10px;
54
- padding: 0 5px 0 5px;}
55
- QGroupBox#groupBox{border: 2px solid rgb(111,181,110);
56
- border-radius: 9px;
57
- font: bold 14pt 'Arial';
58
- margin-top: 10px;}"""
59
-
60
- self.groupBox.setStyleSheet(groupbox_style)
61
- self.groupBox_2.setStyleSheet(groupbox_style.replace("groupBox","groupBox_2"))
62
- self.groupBox_3.setStyleSheet(groupbox_style.replace("groupBox","groupBox_3"))
63
-
64
- scale_ui(self, custom_scale_width=480, custom_scale_height=615)
65
- except Exception as e:
66
- show_error("Error initializing NewEndonuclease class.", e)
67
-
68
- #helper function for writing new endo information to CASPERinfo - used by submit()
69
- def writeNewEndonuclease(self, newEndonucleaseStr):
70
- try:
71
- with open(GlobalSettings.appdir + 'CASPERinfo', 'r') as f, open(GlobalSettings.appdir + "new_file", 'w+') as f1:
72
- for line in f:
73
- f1.write(line)
74
- if 'ENDONUCLEASES' in line:
75
- f1.write(newEndonucleaseStr + '\n') # Move f1.write(line) above, to write above instead
76
- os.remove(GlobalSettings.appdir + "CASPERinfo")
77
- os.rename(GlobalSettings.appdir + "new_file",
78
- GlobalSettings.appdir + "CASPERinfo") # Rename the new file
79
- except Exception as e:
80
- show_error("Error in writeNewEndonuclease() in New Endonuclease.", e)
81
-
82
- #submit new endo to CASPERinfo file
83
- def submit(self):
84
- try:
85
- # This is executed when the button is pressed
86
- name = str(self.organism_name.text())
87
- abbr = str(self.abbreviation.text())
88
- crisprtype = str(self.crispr_type.text())
89
- seed_len = str(self.seed_length.text())
90
- five_len = str(self.five_length.text())
91
- three_len = str(self.three_length.text())
92
- pam = str(self.pam_sequence.text()).upper()
93
- ### Check for multiple PAMs and format if present
94
- if len(pam.split(','))>0:
95
- pam = [x.strip() for x in pam.split(',')]
96
- pam = ",".join(pam)
97
- ### Check for PAM directionality
98
- if self.five_pam.isChecked():
99
- pam_dir = str(5)
100
- else:
101
- pam_dir = str(3)
102
- on_scoring = str(self.comboBox.currentText())
103
- off_scoring = str(self.comboBox_2.currentText())
104
- length = len(seed_len) + len(five_len) + len(three_len)
105
- argument_list = [abbr, pam, five_len, seed_len, three_len, pam_dir, name, crisprtype, on_scoring, off_scoring]
106
- validPAM = ('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
107
- self.error = False;
108
-
109
- ### Error checking for PAM alphabet
110
- for letter in pam:
111
- if (letter not in validPAM):
112
- show_message(
113
- fontSize=12,
114
- icon=QtWidgets.QMessageBox.Icon.Critical,
115
- title="Invalid PAM",
116
- message="Invalid characters in PAM Sequence."
117
- )
118
- return True
119
- ### Error checking for filling out all fields
120
- for arg in argument_list:
121
- if ';' in arg:
122
- show_message(
123
- fontSize=12,
124
- icon=QtWidgets.QMessageBox.Icon.Critical,
125
- title="Invalid Semicolon",
126
- message="Invalid character used: ';'."
127
- )
128
- return True
129
- elif arg == "":
130
- show_message(
131
- fontSize=12,
132
- icon=QtWidgets.QMessageBox.Icon.Critical,
133
- title="Empty Field",
134
- message="Please fill in all fields."
135
- )
136
- return True
137
- else:
138
- pass
139
-
140
- ### Check for duplicate endo abbreviations
141
- for key in GlobalSettings.mainWindow.organisms_to_endos:
142
- endo = GlobalSettings.mainWindow.organisms_to_endos[key]
143
- if abbr in endo:
144
- show_message(
145
- fontSize=12,
146
- icon=QtWidgets.QMessageBox.Icon.Critical,
147
- title="Duplicate endo name.",
148
- message="The given abbreviation already exists. Please choose a unique identifier."
149
- )
150
- return True
151
- else:
152
- pass
153
-
154
- myString = ""
155
- for i, arg in enumerate(argument_list):
156
- if i == len(argument_list)-1: ### Last argument in list
157
- myString += str(arg)
158
- else:
159
- myString += str(arg) + ";"
160
-
161
- self.writeNewEndonuclease(myString)
162
-
163
- ### Refresh endonuclease dropdown in New Genome
164
- GlobalSettings.mainWindow.newGenome.fillEndo()
165
-
166
- self.clear_all()
167
- self.close()
168
- except Exception as e:
169
- show_error("Error in submit() in New Endonuclease.", e)
170
-
171
- #cancel and close window
172
- def cancel(self):
173
- try:
174
- self.clear_all()
175
- self.close()
176
- except Exception as e:
177
- show_error("Error in cancel() in New Endonuclease.", e)
178
-
179
- # This function clears all of the line edits
180
- def clear_all(self):
181
- try:
182
- self.organism_name.clear()
183
- self.abbreviation.clear()
184
- self.crispr_type.clear()
185
- self.seed_length.clear()
186
- self.five_length.clear()
187
- self.three_length.clear()
188
- self.pam_sequence.clear()
189
- except Exception as e:
190
- show_error("Error in clear_all() in New Endonuclease.", e)
191
-
192
- # This function parses CASPERinfo to return the names (in lists) of all on-target and off-target scoring data
193
- def get_on_off_data(self):
194
- try:
195
- filename = GlobalSettings.appdir + "CASPERinfo"
196
- retList_on = []
197
- retList_off = []
198
- with open(filename, 'r') as f:
199
- lines = f.readlines()
200
- for i, line in enumerate(lines):
201
- line = str(line)
202
- if "ON-TARGET DATA" in line:
203
- index = i
204
- while "-----" not in line:
205
- if "DATA:" in line:
206
- retList_on.append(line.split("DATA:")[-1].strip()) ### Append name of scoring data to on-target name list
207
- line = lines[index+1]
208
- index += 1
209
- else:
210
- line = lines[index+1]
211
- index += 1
212
- continue
213
- elif "OFF-TARGET MATRICES" in line:
214
- index = i
215
- while "-----" not in line:
216
- if "MATRIX:" in line:
217
- retList_off.append(line.split("MATRIX:")[-1].strip()) ### Append name of scoring data to off-target name list
218
- line = lines[index+1]
219
- index += 1
220
- else:
221
- line = lines[index+1]
222
- index += 1
223
- continue
224
- else:
225
- continue
226
- return retList_on, retList_off
227
- except Exception as e:
228
- show_error("Error in get_on_off_data() in New Endonuclease.", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/NewGenome.py DELETED
@@ -1,705 +0,0 @@
1
- from ast import Global
2
- import os
3
- from PyQt5 import QtWidgets, uic, QtGui, QtCore, Qt
4
- import models.GlobalSettings as GlobalSettings
5
- from functools import partial
6
- from utils.Algorithms import SeqTranslate
7
- import webbrowser
8
- import platform
9
- import traceback
10
- import math
11
- from utils.ui import show_message, show_error, scale_ui, center_ui
12
- from utils.web import ncbi_page, repo_page
13
-
14
- logger = GlobalSettings.logger
15
-
16
- def iter_except(function, exception):
17
- """Works like builtin 2-argument `iter()`, but stops on `exception`."""
18
- try:
19
- while True:
20
- yield function()
21
- except exception:
22
- return
23
-
24
- #UI prompt for when the user has finished running jobs in new genome to allow them to choose where the want to proceed
25
- class goToPrompt(QtWidgets.QMainWindow):
26
- def __init__(self):
27
- try:
28
- super(goToPrompt, self).__init__()
29
- uic.loadUi(GlobalSettings.appdir + 'ui/newgenomenavigationpage.ui', self)
30
-
31
- groupbox_style = """
32
- QGroupBox:title{subcontrol-origin: margin;
33
- left: 10px;
34
- padding: 0 5px 0 5px;}
35
- QGroupBox#groupBox{border: 2px solid rgb(111,181,110);
36
- border-radius: 9px;
37
- font: bold 14pt 'Arial';
38
- margin-top: 10px;}"""
39
- self.groupBox.setStyleSheet(groupbox_style)
40
- scale_ui(self, custom_scale_width=575, custom_scale_height=175)
41
- self.setWindowTitle("New Genome")
42
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.ico"))
43
- self.hide()
44
-
45
- except Exception as e:
46
- show_error("Unable to initialize goToPrompt class in New Genome.", e)
47
-
48
- #New genome class to allow users to generate new CSPR files
49
- class NewGenome(QtWidgets.QMainWindow):
50
- def __init__(self, info_path):
51
- try:
52
- super(NewGenome, self).__init__()
53
- uic.loadUi(GlobalSettings.appdir + 'ui/NewGenome.ui', self)
54
- self.setWindowTitle('New Genome')
55
- self.setWindowTitle('New Genome')
56
- self.info_path = info_path
57
-
58
- #---Style Modifications---#
59
-
60
- groupbox_style = """
61
- QGroupBox:title{subcontrol-origin: margin;
62
- left: 10px;
63
- padding: 0 5px 0 5px;}
64
- QGroupBox#Step1{border: 2px solid rgb(111,181,110);
65
- border-radius: 9px;
66
- font: bold 14pt 'Arial';
67
- margin-top: 10px;}"""
68
-
69
- self.Step1.setStyleSheet(groupbox_style)
70
- self.Step2.setStyleSheet(groupbox_style.replace("Step1","Step2"))
71
- self.Step3.setStyleSheet(groupbox_style.replace("Step1","Step3"))
72
-
73
- #---Button Modifications---#
74
-
75
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.ico"))
76
- self.resetButton.clicked.connect(self.reset)
77
- self.submitButton.clicked.connect(self.submit)
78
- self.browseForFile.clicked.connect(self.selectFasta)
79
- self.remove_job.clicked.connect(self.remove_from_queue)
80
- self.output_browser.setText("Waiting for program initiation...")
81
- self.contButton.clicked.connect(self.continue_to_main)
82
-
83
- self.comboBoxEndo.currentIndexChanged.connect(self.endo_settings)
84
-
85
- self.runButton.clicked.connect(self.run_jobs_wrapper)
86
- self.clearButton.clicked.connect(self.clear_all)
87
-
88
- self.JobsQueue = [] # holds Job classes.
89
- self.check_strings = []
90
- self.Endos = dict()
91
- self.file = ""
92
-
93
- self.process = QtCore.QProcess()
94
- self.process.setProcessChannelMode(QtCore.QProcess.MergedChannels)
95
- self.process.finished.connect(self.upon_process_finishing)
96
- self.seqTrans = SeqTranslate()
97
- self.exit = False
98
-
99
- self.first = False
100
- #show functionalities on window
101
- self.fillEndo()
102
-
103
- self.num_chromo_next = False
104
-
105
- #Jobs Table
106
- self.job_Table.setShowGrid(False)
107
- self.job_Table.horizontalHeader().setSectionsClickable(True)
108
- self.job_Table.setSelectionBehavior(QtWidgets.QAbstractItemView.SelectRows)
109
- self.job_Table.setEditTriggers(QtWidgets.QAbstractItemView.NoEditTriggers)
110
- self.job_Table.setSelectionMode(QtWidgets.QAbstractItemView.MultiSelection)
111
- self.job_Table.setSizeAdjustPolicy(QtWidgets.QAbstractScrollArea.AdjustToContents)
112
- self.fin_index=0
113
-
114
- self.mwfg = self.frameGeometry() ##Center window
115
- self.cp = QtWidgets.QDesktopWidget().availableGeometry().center() ##Center window
116
- self.total_chrom_count = 0
117
- self.perc_increase = 0
118
- self.progress = 0
119
-
120
- #toolbar button actions
121
- self.visit_repo.triggered.connect(repo_page)
122
- self.go_ncbi.triggered.connect(ncbi_page)
123
-
124
- self.comboBoxEndo.currentIndexChanged.connect(self.changeEndos)
125
-
126
- ### NCBI tool
127
- self.NCBI_File_Search.clicked.connect(self.open_ncbi_tool)
128
-
129
- self.seed_length.setEnabled(False)
130
- self.five_length.setEnabled(False)
131
- self.three_length.setEnabled(False)
132
- self.repeats_box.setEnabled(False)
133
-
134
- ### User prompt class
135
- self.goToPrompt = goToPrompt()
136
- self.goToPrompt.goToMain.clicked.connect(self.continue_to_main)
137
- self.goToPrompt.goToMT.clicked.connect(self.continue_to_MT)
138
- self.goToPrompt.goToPop.clicked.connect(self.continue_to_pop)
139
-
140
- self.orgName.setFocus()
141
-
142
- ### Connect New endonuclease to New Genome
143
- self.actionUpload_New_Endonuclease.triggered.connect(self.launch_newEndonuclease)
144
-
145
- ### Set up validators for input fields:
146
- reg_ex1 = QtCore.QRegExp("[^/\\\\_]+") # No slashes or underscores
147
- reg_ex2 = QtCore.QRegExp("\\S+")
148
- input_validator1 = QtGui.QRegExpValidator(reg_ex1, self)
149
- input_validator2 = QtGui.QRegExpValidator(reg_ex2, self)
150
- self.orgName.setValidator(input_validator1)
151
- self.strainName.setValidator(input_validator1)
152
- self.orgCode.setValidator(input_validator2)
153
-
154
- scale_ui(self, custom_scale_width=850, custom_scale_height=750)
155
- self.first_show = True
156
- except Exception as e:
157
- show_error("Error initializing New Genome class.", e)
158
-
159
- def launch_newEndonuclease(self):
160
- try:
161
- GlobalSettings.mainWindow.getData()
162
- GlobalSettings.mainWindow.newEndonuclease.centerUI()
163
- GlobalSettings.mainWindow.newEndonuclease.show()
164
- GlobalSettings.mainWindow.newEndonuclease.activateWindow()
165
- except Exception as e:
166
- show_error("Error in launch_newEndonuclease() in New Genome.", e)
167
-
168
- #open the ncbi search tool window
169
- def open_ncbi_tool(self):
170
- try:
171
- #center ncbi on current screen
172
- if GlobalSettings.mainWindow.ncbi.first_show == True:
173
- GlobalSettings.mainWindow.ncbi.first_show = False
174
- GlobalSettings.mainWindow.ncbi.centerUI()
175
- if self.orgName.text() != "":
176
- GlobalSettings.mainWindow.ncbi.organism_line_edit.setText(self.orgName.text())
177
- if self.strainName.text() != "":
178
- GlobalSettings.mainWindow.ncbi.infra_name_line_edit.setText(self.strainName.text())
179
- GlobalSettings.mainWindow.ncbi.show()
180
- GlobalSettings.mainWindow.ncbi.activateWindow()
181
- except Exception as e:
182
- show_error("Error in open_ncbi_tool() in New Genome.", e)
183
-
184
- def remove_from_queue(self):
185
- try:
186
- while(True):
187
- indexes = self.job_Table.selectionModel().selectedRows()
188
- if len(indexes) == 0:
189
- break
190
- self.job_Table.removeRow(indexes[0].row())
191
- except Exception as e:
192
- show_error("Error in remove_from_queue() in New Genome.", e)
193
-
194
- #prompt user with file browser to select fasta/fna files
195
- def selectFasta(self):
196
- try:
197
- filed = QtWidgets.QFileDialog()
198
- myFile = QtWidgets.QFileDialog.getOpenFileName(filed, "Choose a File")
199
- if (myFile[0] != ""):
200
- if not myFile[0].endswith(".fa") and not myFile[0].endswith(".fna") and not myFile[0].endswith(".fasta"):
201
- show_message(
202
- fontSize=12,
203
- icon=QtWidgets.QMessageBox.Icon.Critical,
204
- title="File Selection Error",
205
- message="You have selected an incorrect type of file. Please choose a FASTA/FNA file."
206
- )
207
- return
208
- else:
209
- self.file = myFile[0]
210
- self.selectedFile.setText(str(myFile[0]))
211
- except Exception as e:
212
- show_error("Error in selectFasta() in New Genome.", e)
213
-
214
- #submit jobs to queue
215
- def submit(self):
216
- try:
217
- warning = ""
218
- if len(self.orgName.text()) == 0:
219
- warning = warning + "You need to include the organism's name."
220
- if len(self.file) == 0:
221
- warning = warning + "You need to select a file."
222
- if len(warning) != 0:
223
- show_message(
224
- fontSize=12,
225
- icon=QtWidgets.QMessageBox.Icon.Critical,
226
- title="Required Information",
227
- message=warning
228
- )
229
- return
230
- if len(self.strainName.text()) == 0:
231
- warning = warning + "\nIt is recommended to include the organism's subspecies/strain."
232
- if len(self.orgCode.text()) == 0:
233
- warning = warning + "\nYou must include an organism code."
234
- if len(warning) != 0:
235
- msgBox = QtWidgets.QMessageBox()
236
- msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
237
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
238
- msgBox.setWindowTitle("Missing Information")
239
- msgBox.setText(warning + "\n\nDo you wish to continue without including this information?")
240
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
241
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
242
- msgBox.exec()
243
-
244
- if msgBox.result() == QtWidgets.QMessageBox.No:
245
- return
246
-
247
- #endo, pam, repeats, directionality, five length, seed length, three length, orgcode, output path, CASPERinfo path, fna path, orgName, notes, on target matrix
248
- args = self.Endos[self.comboBoxEndo.currentText()][0]
249
- args += " " + self.Endos[self.comboBoxEndo.currentText()][1]
250
- if self.mt.isChecked():
251
- args += " " + "TRUE"
252
- else:
253
- args += " " + "FALSE"
254
-
255
- if self.Endos[self.comboBoxEndo.currentText()][5] == "3":
256
- args += " " + "FALSE"
257
- else:
258
- args += " " + "TRUE"
259
-
260
- if self.repeats_box.isChecked():
261
- args += " " + "TRUE"
262
- else:
263
- args += " " + "FALSE"
264
-
265
- args += " " + self.Endos[self.comboBoxEndo.currentText()][2]
266
- args += " " + self.Endos[self.comboBoxEndo.currentText()][3]
267
- args += " " + self.Endos[self.comboBoxEndo.currentText()][4]
268
- args += " " + self.orgCode.text()
269
- if platform.system() == 'Windows':
270
- args += " " + '"' + GlobalSettings.CSPR_DB.replace("/","\\") + '\\"'
271
- args += " " + '"' + GlobalSettings.appdir.replace("/","\\") + "CASPERinfo" + '"'
272
- args += " " + '"' + self.file.replace("/","\\") + '"'
273
- else:
274
- args += " " + '"' + GlobalSettings.CSPR_DB.replace("\\","/") + '/"'
275
- args += " " + '"' + GlobalSettings.appdir.replace("\\","/") + "CASPERinfo" + '"'
276
- args += " " + '"' + self.file.replace("\\","/") + '"'
277
-
278
- args += " " + '"' + self.orgName.text() + " " + self.strainName.text() + '"'
279
- args += " " + '"' + "notes" + '"'
280
- args += " " + '"DATA:' + self.Endos[self.comboBoxEndo.currentText()][6] + '"'
281
-
282
- tmp = self.orgName.text()+ " " + self.strainName.text() + " " + self.Endos[self.comboBoxEndo.currentText()][0] + " " + self.orgCode.text()
283
- if tmp in self.check_strings:
284
- show_message(
285
- fontSize=12,
286
- icon=QtWidgets.QMessageBox.Icon.Critical,
287
- title="Duplicate Entry",
288
- message="You have submitted a duplicate entry. Consider changing the organism code or strain name to differentiate closely related strains."
289
- )
290
- return
291
- name = self.orgCode.text() + "_" + str(self.Endos[self.comboBoxEndo.currentText()][0])
292
- rowPosition = self.job_Table.rowCount()
293
- self.job_Table.insertRow(rowPosition)
294
- item = QtWidgets.QTableWidgetItem(name)
295
- item.setTextAlignment(QtCore.Qt.AlignHCenter)
296
- self.job_Table.setItem(rowPosition, 0, item)
297
- self.check_strings.append(tmp)
298
- self.JobsQueue.append(args)
299
- except Exception as e:
300
- show_error("Error in submit() in New Genome.", e)
301
-
302
- #fill the endo dropdown
303
- def fillEndo(self):
304
- try:
305
- #disconnect signal
306
- try:
307
- self.comboBoxEndo.currentIndexChanged.disconnect()
308
- except:
309
- pass
310
-
311
- #clear out the endo box
312
- self.comboBoxEndo.clear()
313
-
314
- f = open(GlobalSettings.appdir + "CASPERinfo")
315
- while True:
316
- line = f.readline()
317
- if line.startswith('ENDONUCLEASES'):
318
- while True:
319
- line = f.readline()
320
- if (line[0] == "-"):
321
- break
322
- line_tokened = line.split(";")
323
- if len(line_tokened) == 10:
324
- endo = line_tokened[0]
325
- # Checking to see if there is more than one pam sequence in the list
326
- if line_tokened[1].find(",") != -1:
327
- p_pam = line_tokened[1].split(",")[0]
328
- else:
329
- p_pam = line_tokened[1]
330
- five_length = line_tokened[2]
331
- seed_length = line_tokened[3]
332
- three_length = line_tokened[4]
333
- dir = line_tokened[5]
334
- on_target_data = line_tokened[8]
335
- self.Endos[endo + " - PAM: " + p_pam] = (endo, p_pam, five_length, seed_length, three_length, dir, on_target_data)
336
- break
337
- f.close()
338
- self.comboBoxEndo.addItems(self.Endos.keys())
339
- key = list(self.Endos.keys())[0]
340
- self.seed_length.setText(self.Endos[key][3])
341
- self.five_length.setText(self.Endos[key][2])
342
- self.three_length.setText(self.Endos[key][4])
343
-
344
- #reconnect signal
345
- self.comboBoxEndo.currentIndexChanged.connect(self.changeEndos)
346
- except Exception as e:
347
- show_error("Error in fillEndo() in New Genome.", e)
348
-
349
- #event handler for endo changing - update endo length data
350
- def changeEndos(self):
351
- try:
352
- key = str(self.comboBoxEndo.currentText())
353
- self.seed_length.setText(self.Endos[key][3])
354
- self.five_length.setText(self.Endos[key][2])
355
- self.three_length.setText(self.Endos[key][4])
356
- except Exception as e:
357
- show_error("Error in changeEndos() in New Genome.", e)
358
-
359
- #check if endo is 3' or 5'
360
- def endo_settings(self):
361
- try:
362
- # check the if it's 3' or 5', and check the box accordingly
363
- if int(self.seqTrans.endo_info[self.Endos[self.comboBoxEndo.currentText()][0]][3]) == 3:
364
- self.pamBox.setChecked(0)
365
- elif int(self.seqTrans.endo_info[self.Endos[self.comboBoxEndo.currentText()][0]][3]) == 5:
366
- self.pamBox.setChecked(1)
367
- except Exception as e:
368
- show_error("Error in endo_settings() in New Genome.", e)
369
-
370
- #wrapper for running jobs
371
- def run_jobs_wrapper(self):
372
- try:
373
- self.indexes = []
374
- self.job_Table.selectAll()
375
- indexes = self.job_Table.selectionModel().selectedRows()
376
- for index in sorted(indexes):
377
- if self.job_Table.item(index.row(), 0).text() != "":
378
- self.indexes.append(index.row())
379
- self.run_job()
380
- except Exception as e:
381
- show_error("Error in run_jobs_wrapper() in New Genome.", e)
382
-
383
- #run job in queue
384
- def run_job(self):
385
- try:
386
- if len(self.indexes) > 0:
387
- self.progressBar.setValue(0)
388
- self.progress = 0
389
- row_index = self.indexes[0]
390
- name = self.job_Table.item(row_index, 0).text()
391
- item = QtWidgets.QTableWidgetItem(name)
392
- item.setTextAlignment(QtCore.Qt.AlignHCenter)
393
- self.job_Table.setItem(row_index, 1, item)
394
- self.job_Table.setItem(row_index, 0, QtWidgets.QTableWidgetItem(""))
395
-
396
- def output_stdout(p):
397
- line = str(p.readAll())
398
- line = line[2:]
399
- line = line[:len(line) - 1]
400
- for lines in line.split(r"\n"):
401
- lines = lines.rstrip("\n")
402
- lines = lines.rstrip("\r")
403
- lines = lines.rstrip(r"\n")
404
- lines = lines.rstrip(r"\r")
405
- lines = lines.rstrip("\r\n")
406
- lines = lines.rstrip(r"\r\n")
407
- if lines != "":
408
- if lines.find("Number of Chromosomes/Scaffolds") != -1:
409
- copy = lines
410
- copy = copy.replace(" ","")
411
- copy = copy[copy.find(":")+1:]
412
- self.total_chrom_count = int(copy)
413
- self.perc_increase = ((1 / (2 * self.total_chrom_count)) * 70)
414
- self.progressBar.setValue(20)
415
- self.progress = 20
416
- elif lines.find("complete.") != -1:
417
- self.progress += self.perc_increase
418
- self.progressBar.setValue(int(self.progress))
419
- elif lines.find("Processing Targets.") != -1:
420
- self.progress = 70
421
- self.progressBar.setValue(int(self.progress))
422
- elif lines.find("Writing out uniques.") != -1:
423
- self.progress = 90
424
- self.progressBar.setValue(int(self.progress))
425
- elif lines.find("Writing out repeats.") != -1:
426
- self.progress = 95
427
- self.progressBar.setValue(int(self.progress))
428
- elif lines == "Finished.":
429
- self.progress = 100
430
- self.progressBar.setValue(int(self.progress))
431
- self.output_browser.append(lines)
432
-
433
- job_args = self.JobsQueue[row_index]
434
- if platform.system() == 'Windows':
435
- program = '"' + GlobalSettings.appdir + "SeqFinderFolder/Casper_Seq_Finder_Win.exe" + '" '
436
- elif platform.system() == 'Linux':
437
- program = '"' + GlobalSettings.appdir + "SeqFinderFolder/Casper_Seq_Finder_Lin" + '" '
438
- else:
439
- program = '"' + GlobalSettings.appdir + "SeqFinderFolder/Casper_Seq_Finder_Mac" + '" '
440
- program += job_args
441
- self.process.readyReadStandardOutput.connect(partial(output_stdout, self.process))
442
- self.process.start(program)
443
- else:
444
- show_message(
445
- fontSize=12,
446
- icon=QtWidgets.QMessageBox.Icon.Critical,
447
- title="No Jobs To Run",
448
- message="No jobs are in the queue to run. Please add a job before running."
449
- )
450
- except Exception as e:
451
- show_error("Error in run_job() in New Genome.", e)
452
-
453
- #even handler for when jobs finish execution
454
- def upon_process_finishing(self):
455
- try:
456
- row_index = self.indexes[0]
457
- name = self.job_Table.item(row_index, 1).text()
458
- item = QtWidgets.QTableWidgetItem(name)
459
- item.setTextAlignment(QtCore.Qt.AlignHCenter)
460
- self.job_Table.setItem(row_index, 2, item)
461
- self.job_Table.setItem(row_index, 1, QtWidgets.QTableWidgetItem(""))
462
- self.indexes.pop(0)
463
- if len(self.indexes) != 0:
464
- self.run_job()
465
- else:
466
- #prompt user if they want to analyze their new files
467
- center_ui(self.goToPrompt)
468
- self.goToPrompt.show()
469
- self.goToPrompt.activateWindow()
470
- except Exception as e:
471
- show_error("Error in upon_process_finishing() in New Genome.", e)
472
-
473
- #clear the job table
474
- def clear_all(self):
475
- try:
476
- self.process.kill()
477
- self.fin_index = 0
478
- self.job_Table.clearContents()
479
- self.job_Table.setRowCount(0)
480
- self.JobsQueue = []
481
- self.check_strings = []
482
- self.output_browser.clear()
483
- self.output_browser.setText("Waiting for program initiation...")
484
- self.orgName.clear()
485
- self.strainName.clear()
486
- self.orgCode.clear()
487
- self.selectedFile.clear()
488
- self.selectedFile.setPlaceholderText("Selected FASTA/FNA File")
489
- self.progressBar.setValue(0)
490
- self.first = False
491
- except Exception as e:
492
- show_error("Error in clear_all() in New Genome.", e)
493
-
494
- #reset the whole form
495
- def reset(self):
496
- try:
497
- self.orgName.clear()
498
- self.strainName.clear()
499
- self.orgCode.clear()
500
- self.selectedFile.clear()
501
- self.selectedFile.setPlaceholderText("Selected FASTA/FNA File")
502
- self.output_browser.clear()
503
- self.output_browser.setText("Waiting for program initiation...")
504
- self.file = ""
505
- except Exception as e:
506
- show_error("Error in reset() in New Genome.", e)
507
-
508
- #event handler for user wanting to close the window
509
- def closeEvent(self, event):
510
- try:
511
- # make sure that there are cspr files in the DB
512
- file_names = os.listdir(GlobalSettings.CSPR_DB)
513
- noCSPRFiles = True
514
- for file in file_names:
515
- if 'cspr' in file:
516
- noCSPRFiles = False
517
- break
518
- if noCSPRFiles == True:
519
- if self.exit == False:
520
- msgBox = QtWidgets.QMessageBox()
521
- msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
522
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
523
- msgBox.setWindowTitle("No CSPR file generated")
524
- msgBox.setText("No CSPR file has been generated, thus the main program cannot run. Please create a CSPR file."
525
- "Alternatively, you could quit the program. Would you like to quit?")
526
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
527
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
528
- msgBox.exec()
529
-
530
- if (msgBox.result() == QtWidgets.QMessageBox.No):
531
- event.ignore()
532
- else:
533
- event.accept()
534
- else:
535
- self.exit = False
536
- event.accept()
537
- else:
538
- self.process.kill()
539
- self.clear_all()
540
- self.goToPrompt.hide()
541
- GlobalSettings.mainWindow.fill_annotation_dropdown()
542
- if GlobalSettings.mainWindow.orgChoice.currentText() != '':
543
- GlobalSettings.mainWindow.orgChoice.currentIndexChanged.disconnect()
544
- GlobalSettings.mainWindow.orgChoice.clear()
545
- GlobalSettings.mainWindow.endoChoice.clear()
546
- GlobalSettings.mainWindow.getData()
547
- GlobalSettings.MTWin.launch()
548
- GlobalSettings.pop_Analysis.launch()
549
-
550
- if GlobalSettings.mainWindow.first_show == True:
551
- GlobalSettings.mainWindow.first_show = False
552
- GlobalSettings.mainWindow.centerUI()
553
- GlobalSettings.mainWindow.show()
554
- event.accept()
555
- except Exception as e:
556
- show_error("Error in closeEvent() in New Genome.", e)
557
-
558
- #event handler for user wanting to go to Main once jobs complete
559
- def continue_to_main(self):
560
- try:
561
- # make sure that there are cspr files in the DB
562
- file_names = os.listdir(GlobalSettings.CSPR_DB)
563
- noCSPRFiles = True
564
- for file in file_names:
565
- if 'cspr' in file:
566
- noCSPRFiles = False
567
- break
568
- if noCSPRFiles == True:
569
-
570
- msgBox = QtWidgets.QMessageBox()
571
- msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
572
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
573
- msgBox.setWindowTitle("No CSPR file generated")
574
- msgBox.setText(
575
- "No CSPR file has been generated, thus the main program cannot run. Please create a CSPR file."
576
- "Alternatively, you could quit the program. Would you like to quit?")
577
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
578
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
579
- msgBox.exec()
580
-
581
- if (msgBox.result() == QtWidgets.QMessageBox.Yes):
582
- self.exit = True
583
- self.close()
584
- else:
585
- self.process.kill()
586
- self.clear_all()
587
- self.goToPrompt.hide()
588
- GlobalSettings.mainWindow.fill_annotation_dropdown()
589
- if GlobalSettings.mainWindow.orgChoice.currentText() != '':
590
- GlobalSettings.mainWindow.orgChoice.currentIndexChanged.disconnect()
591
- GlobalSettings.mainWindow.orgChoice.clear()
592
- GlobalSettings.mainWindow.endoChoice.clear()
593
- GlobalSettings.mainWindow.getData()
594
- GlobalSettings.MTWin.launch()
595
- GlobalSettings.pop_Analysis.launch()
596
-
597
- # center main on current screen
598
- if GlobalSettings.mainWindow.first_show == True:
599
- GlobalSettings.mainWindow.first_show = False
600
- center_ui(GlobalSettings.mainWindow)
601
- GlobalSettings.mainWindow.show()
602
- self.hide()
603
- except Exception as e:
604
- show_error("Error in continue_to_main() in New Genome.", e)
605
-
606
- #event handler for user wanting to go to multi-targeting once jobs complete
607
- def continue_to_MT(self):
608
- try:
609
- # make sure that there are cspr files in the DB
610
- file_names = os.listdir(GlobalSettings.CSPR_DB)
611
- noCSPRFiles = True
612
- for file in file_names:
613
- if 'cspr' in file:
614
- noCSPRFiles = False
615
- break
616
- if noCSPRFiles == True:
617
-
618
- msgBox = QtWidgets.QMessageBox()
619
- msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
620
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
621
- msgBox.setWindowTitle("No CSPR file generated")
622
- msgBox.setText(
623
- "No CSPR file has been generated, thus the main program cannot run. Please create a CSPR file."
624
- "Alternatively, you could quit the program. Would you like to quit?")
625
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
626
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
627
- msgBox.exec()
628
-
629
-
630
-
631
- if (msgBox.result() == QtWidgets.QMessageBox.Yes):
632
- self.exit = True
633
- self.close()
634
-
635
- else:
636
- self.process.kill()
637
- self.clear_all()
638
- self.goToPrompt.hide()
639
- GlobalSettings.mainWindow.fill_annotation_dropdown()
640
- if GlobalSettings.mainWindow.orgChoice.currentText() != '':
641
- GlobalSettings.mainWindow.orgChoice.currentIndexChanged.disconnect()
642
- GlobalSettings.mainWindow.orgChoice.clear()
643
- GlobalSettings.mainWindow.endoChoice.clear()
644
- GlobalSettings.mainWindow.getData()
645
- GlobalSettings.MTWin.launch()
646
- GlobalSettings.pop_Analysis.launch()
647
-
648
- # center multi-targeting on current screen
649
- if GlobalSettings.MTWin.first_show == True:
650
- GlobalSettings.MTWin.first_show = False
651
- GlobalSettings.MTWin.centerUI()
652
-
653
- GlobalSettings.MTWin.show()
654
- self.hide()
655
- except Exception as e:
656
- show_error("Error in continue_to_MT() in New Genome.", e)
657
-
658
- #event handler for user wanting to go to population analysis once jobs complete
659
- def continue_to_pop(self):
660
- try:
661
- # make sure that there are cspr files in the DB
662
- file_names = os.listdir(GlobalSettings.CSPR_DB)
663
- noCSPRFiles = True
664
- for file in file_names:
665
- if 'cspr' in file:
666
- noCSPRFiles = False
667
- break
668
- if noCSPRFiles == True:
669
-
670
- msgBox = QtWidgets.QMessageBox()
671
- msgBox.setStyleSheet("font: " + str(self.fontSize) + "pt 'Arial'")
672
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
673
- msgBox.setWindowTitle("No CSPR file generated")
674
- msgBox.setText(
675
- "No CSPR file has been generated, thus the main program cannot run. Please create a CSPR file."
676
- "Alternatively, you could quit the program. Would you like to quit?")
677
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
678
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
679
- msgBox.exec()
680
-
681
- if (msgBox.result() == QtWidgets.QMessageBox.Yes):
682
- self.exit = True
683
- self.close()
684
-
685
- else:
686
- self.process.kill()
687
- self.clear_all()
688
- self.goToPrompt.hide()
689
- GlobalSettings.mainWindow.fill_annotation_dropdown()
690
- if GlobalSettings.mainWindow.orgChoice.currentText() != '':
691
- GlobalSettings.mainWindow.orgChoice.currentIndexChanged.disconnect()
692
- GlobalSettings.mainWindow.orgChoice.clear()
693
- GlobalSettings.mainWindow.endoChoice.clear()
694
- GlobalSettings.mainWindow.getData()
695
- GlobalSettings.MTWin.launch()
696
- GlobalSettings.pop_Analysis.launch()
697
-
698
- if GlobalSettings.pop_Analysis.first_show == True:
699
- GlobalSettings.pop_Analysis.first_show = False
700
- GlobalSettings.pop_Analysis.centerUI()
701
-
702
- GlobalSettings.pop_Analysis.show()
703
- self.hide()
704
- except Exception as e:
705
- show_error("Error in continue_to_pop() in New Genome.", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/PopulationAnalysisWindowView.py CHANGED
@@ -5,7 +5,7 @@ from matplotlib.figure import Figure
5
  import mplcursors
6
  import numpy as np
7
  import matplotlib.patches as patches
8
- from utils.ui import show_error, scale_ui
9
 
10
  class PopulationAnalysisWindowView(QtWidgets.QMainWindow):
11
  def __init__(self, global_settings):
 
5
  import mplcursors
6
  import numpy as np
7
  import matplotlib.patches as patches
8
+ from utils.ui import show_error
9
 
10
  class PopulationAnalysisWindowView(QtWidgets.QMainWindow):
11
  def __init__(self, global_settings):
src/views/ScoringOptionsView.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PyQt6 import QtWidgets, uic
2
+ from PyQt6.QtCore import pyqtSignal
3
+ import traceback
4
+
5
class ScoringOptionsView(QtWidgets.QMainWindow):
    """Window that lets the user pick a scoring algorithm and an input FASTA file.

    The layout is loaded from ``scoring_options.ui``. The view performs no
    scoring itself; it reports user actions through two signals:

    * ``fasta_selected(str)`` -- emitted with the path chosen in the file dialog.
    * ``submit_clicked()`` -- emitted when the submit button is pressed.
    """

    # Emitted with the chosen file path after the user selects a FASTA file.
    fasta_selected = pyqtSignal(str)
    # Emitted when the submit button is clicked.
    submit_clicked = pyqtSignal()

    # Colour palettes keyed by theme name. Kept at class level so the tables
    # are built once rather than on every apply_theme() call.
    _THEMES = {
        "dark": {
            "bg_color": "#2b2b2b",
            "fg_color": "#ffffff",
            "button_bg_color": "#3a3a3a",
            "button_border_color": "#5a5a5a",
            "button_hover_bg_color": "#4a4a4a",
            "input_bg_color": "#3a3a3a",
            "input_border_color": "#5a5a5a",
            "progress_bar_bg": "#3a3a3a",
            "progress_bar_chunk": "#51b85e",
        },
        "light": {
            "bg_color": "#f0f0f0",
            "fg_color": "#000000",
            "button_bg_color": "#e0e0e0",
            "button_border_color": "#c0c0c0",
            "button_hover_bg_color": "#d0d0d0",
            "input_bg_color": "#ffffff",
            "input_border_color": "#c0c0c0",
            "progress_bar_bg": "#e0e0e0",
            "progress_bar_chunk": "#51b85e",
        },
    }

    def __init__(self, global_settings):
        """Store the settings object, fetch its logger and build the UI.

        Args:
            global_settings: Settings object; this class calls its
                ``get_logger()``, ``get_ui_dir_path()``, ``get_theme()`` and
                ``get_db_path()`` methods.
        """
        super().__init__()
        self.settings = global_settings
        self.logger = self.settings.get_logger()
        self._init_ui()

    def _init_ui(self):
        """Load the .ui file, look up the widgets, wire signals and apply the theme.

        Raises:
            Exception: Any error raised while loading or wiring the UI is
                logged (with stack trace) and re-raised.
        """
        try:
            uic.loadUi(self.settings.get_ui_dir_path() + '/scoring_options.ui', self)

            # Look up the widgets this view drives; fail here, with the widget
            # name in the message, if the .ui file does not provide one.
            self.push_button_browse = self._find_widget('pbtnBrowse', QtWidgets.QPushButton)
            self.push_button_submit = self._find_widget('pbtnSubmit', QtWidgets.QPushButton)
            self.line_edit_fasta = self._find_widget('ledInputFASTA', QtWidgets.QLineEdit)
            self.radio_button_azimuth = self._find_widget('rbtnAzimuth', QtWidgets.QRadioButton)

            # Connect signals
            self.push_button_browse.clicked.connect(self._browse_fasta)
            self.push_button_submit.clicked.connect(self.submit_clicked.emit)

            self.setWindowTitle("Select Scoring Algorithm")
            self.apply_theme()

        except Exception as e:
            self.logger.error(f"Error initializing ScoringOptionsView: {str(e)}")
            self.logger.error(f"Stack trace: {traceback.format_exc()}")
            raise

    def _find_widget(self, name, widget_type):
        """Return the child widget called *name*, or raise if it is missing.

        Args:
            name: Object name of the widget in the .ui file.
            widget_type: Expected Qt widget class.

        Raises:
            AttributeError: If no child of that type and name exists. The
                same exception type previously surfaced when the ``None``
                lookup result was dereferenced, now with a usable message.
        """
        widget = self.findChild(widget_type, name)
        if widget is None:
            raise AttributeError(
                f"Widget '{name}' ({widget_type.__name__}) not found in scoring_options.ui"
            )
        return widget

    def apply_theme(self):
        """Apply the current theme to the window.

        The "dark" palette is used when the settings report the theme
        ``"dark"``; every other value falls back to the light palette.
        Failures are logged and swallowed so a styling problem never blocks
        the window from opening.
        """
        try:
            current_theme = self.settings.get_theme()
            theme = self._THEMES["dark"] if current_theme == "dark" else self._THEMES["light"]

            # Doubled braces are literal braces in the f-string (Qt style sheet blocks).
            self.setStyleSheet(f"""
                QMainWindow, QWidget {{
                    background-color: {theme['bg_color']};
                    color: {theme['fg_color']};
                }}
                QPushButton {{
                    background-color: {theme['button_bg_color']};
                    border: 1px solid {theme['button_border_color']};
                    padding: 5px;
                    min-width: 80px;
                }}
                QPushButton:hover {{
                    background-color: {theme['button_hover_bg_color']};
                }}
                QLineEdit {{
                    background-color: {theme['input_bg_color']};
                    border: 1px solid {theme['input_border_color']};
                    padding: 5px;
                }}
                QRadioButton {{
                    color: {theme['fg_color']};
                }}
                QProgressBar {{
                    border: 1px solid {theme['button_border_color']};
                    background-color: {theme['progress_bar_bg']};
                    text-align: center;
                }}
                QProgressBar::chunk {{
                    background-color: {theme['progress_bar_chunk']};
                }}
                QGroupBox {{
                    border: 1px solid {theme['button_border_color']};
                    margin-top: 0.5em;
                    padding-top: 0.5em;
                }}
                QGroupBox::title {{
                    color: {theme['fg_color']};
                    subcontrol-origin: margin;
                    left: 10px;
                    padding: 0 3px 0 3px;
                }}
            """)

        except Exception as e:
            self.logger.error(f"Error applying theme: {str(e)}")
            self.logger.error(f"Stack trace: {traceback.format_exc()}")

    def _browse_fasta(self):
        """Ask the user for a FASTA file and publish the selection.

        The dialog starts in the database directory reported by the settings.
        On a non-empty selection the path is written to the line edit and
        emitted through ``fasta_selected``. Cancelling the dialog does nothing.
        Errors are logged and shown to the user instead of propagating out of
        the Qt slot.
        """
        try:
            # NOTE(review): assumes get_db_path() returns a str; an empty
            # value falls back to Qt's default start directory -- confirm.
            db_path = self.settings.get_db_path() or ""

            # Parent the dialog to this window instead of a throwaway
            # QFileDialog instance.
            file_path, _ = QtWidgets.QFileDialog.getOpenFileName(
                self,
                "Choose FASTA File",
                db_path,  # initial directory
                "FASTA Files (*.fa *.fasta *.fna)",
            )

            if file_path:
                self.line_edit_fasta.setText(file_path)
                self.fasta_selected.emit(file_path)
                self.logger.debug(f"Selected FASTA file: {file_path}")

        except Exception as e:
            self.logger.error(f"Error browsing FASTA file: {str(e)}")
            # Log the trace as the other handlers in this class do.
            self.logger.error(f"Stack trace: {traceback.format_exc()}")
            self.show_error("Error", f"Error selecting FASTA file: {str(e)}")

    def get_selected_algorithm(self):
        """Return ``"Azimuth 2.0"`` if its radio button is checked, else ``None``."""
        if self.radio_button_azimuth.isChecked():
            return "Azimuth 2.0"
        return None

    def get_fasta_path(self):
        """Return the current text of the FASTA path line edit."""
        return self.line_edit_fasta.text()

    def show_error(self, title, message):
        """Show a modal critical message box with the given title and text."""
        QtWidgets.QMessageBox.critical(self, title, message)

    def show_info(self, title, message):
        """Show a modal information message box with the given title and text."""
        QtWidgets.QMessageBox.information(self, title, message)
src/views/ViewTargetsView.py CHANGED
@@ -1,10 +1,16 @@
1
  from typing import Optional
2
- from PyQt6 import QtWidgets, uic
3
  from PyQt6.QtWidgets import QTableWidgetItem, QAbstractItemView
4
  from PyQt6.QtGui import QTextDocument
 
5
  from utils.ui import show_error
 
 
6
 
7
  class ViewTargetsView(QtWidgets.QMainWindow):
 
 
 
8
  def __init__(self, global_settings):
9
  super().__init__()
10
  self.settings = global_settings
@@ -26,6 +32,9 @@ class ViewTargetsView(QtWidgets.QMainWindow):
26
 
27
  self.push_button_export_grna = self._find_widget('pbtnExportgRNA', QtWidgets.QPushButton)
28
 
 
 
 
29
  def _init_grpGuideViewer(self):
30
  self.combo_box_gene = self._find_widget('cmbGene', QtWidgets.QComboBox)
31
  self.combo_box_endonuclease = self._find_widget('cmbEndonuclease', QtWidgets.QComboBox)
@@ -61,63 +70,184 @@ class ViewTargetsView(QtWidgets.QMainWindow):
61
  return widget
62
 
63
  def display_targets_in_table(self, targets):
64
- """Display targets in table with all data"""
65
- self.table_targets.setRowCount(len(targets))
66
-
67
- for row, target in enumerate(targets):
68
- # Handle tuple format (location, sequence, pam, score, strand, endonuclease)
69
- if isinstance(target, tuple):
70
- self.table_targets.setItem(row, 0, QTableWidgetItem(str(target[0]))) # Location
71
- self.table_targets.setItem(row, 1, QTableWidgetItem(str(target[5]))) # Endonuclease
72
- self.table_targets.setItem(row, 2, QTableWidgetItem(str(target[1]))) # Sequence
73
- self.table_targets.setItem(row, 3, QTableWidgetItem(str(target[4]))) # Strand
74
- self.table_targets.setItem(row, 4, QTableWidgetItem(str(target[2]))) # PAM
75
- self.table_targets.setItem(row, 5, QTableWidgetItem(str(target[3]))) # Score
76
- self.table_targets.setItem(row, 6, QTableWidgetItem("--.--")) # Off-Target placeholder
77
- # Handle dictionary format
 
 
 
 
 
 
 
 
 
78
  else:
79
- self.table_targets.setItem(row, 0, QTableWidgetItem(str(target['location'])))
80
- self.table_targets.setItem(row, 1, QTableWidgetItem(str(target['endonuclease'])))
81
- self.table_targets.setItem(row, 2, QTableWidgetItem(str(target['sequence'])))
82
- self.table_targets.setItem(row, 3, QTableWidgetItem(str(target['strand'])))
83
- self.table_targets.setItem(row, 4, QTableWidgetItem(str(target['pam'])))
84
- self.table_targets.setItem(row, 5, QTableWidgetItem(str(target['score'])))
85
- self.table_targets.setItem(row, 6, QTableWidgetItem("--.--"))
86
-
87
- # Add details button
88
- details_button = QtWidgets.QPushButton("Details")
89
- self.table_targets.setCellWidget(row, 7, details_button)
90
-
91
- self.table_targets.resizeColumnsToContents()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def get_selected_targets(self):
94
  """Get selected targets with all necessary data"""
95
- selected_rows = set(index.row() for index in self.table_targets.selectedIndexes())
96
- selected_targets = []
97
-
98
- # Get column indices once
99
- columns = {
100
- 'location': 0,
101
- 'endonuclease': 1,
102
- 'sequence': 2, # Make sure to get the sequence
103
- 'strand': 3,
104
- 'pam': 4,
105
- 'score': 5,
106
- 'off_target': 6
107
- }
108
-
109
- for row in selected_rows:
110
- target = {
111
- 'location': self.table_targets.item(row, columns['location']).text(),
112
- 'endonuclease': self.table_targets.item(row, columns['endonuclease']).text(),
113
- 'sequence': self.table_targets.item(row, columns['sequence']).text(), # Get sequence
114
- 'strand': self.table_targets.item(row, columns['strand']).text(),
115
- 'pam': self.table_targets.item(row, columns['pam']).text(),
116
- 'score': self.table_targets.item(row, columns['score']).text(),
117
- 'off_target': self.table_targets.item(row, columns['off_target']).text()
118
  }
119
- selected_targets.append(target)
120
- return selected_targets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  def get_row_data(self, row):
123
  return {
@@ -134,10 +264,53 @@ class ViewTargetsView(QtWidgets.QMainWindow):
134
  self.combo_box_endonuclease.addItems(endonucleases)
135
 
136
  def set_combo_box_gene(self, genes):
137
- self.combo_box_gene.addItems(genes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
  def set_text_edit_gene_viewer(self, sequence):
140
- self.text_edit_gene_viewer.setText(sequence)
 
 
 
 
 
 
 
 
 
141
 
142
  def update_gene_info(self, info):
143
  # Implement this method if you have a widget to display gene info
@@ -180,3 +353,53 @@ class ViewTargetsView(QtWidgets.QMainWindow):
180
  def get_export_file_path(self):
181
  # Implement this method to get the export file path from the user
182
  return QtWidgets.QFileDialog.getSaveFileName(self, 'Save File')[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import Optional
2
+ from PyQt6 import QtWidgets, uic, QtCore
3
  from PyQt6.QtWidgets import QTableWidgetItem, QAbstractItemView
4
  from PyQt6.QtGui import QTextDocument
5
+ from PyQt6.QtCore import Qt, pyqtSignal
6
  from utils.ui import show_error
7
+ import time
8
+ import traceback
9
 
10
  class ViewTargetsView(QtWidgets.QMainWindow):
11
+ # Define the signal
12
+ gene_selected = pyqtSignal(str) # Signal to emit when gene is selected
13
+
14
  def __init__(self, global_settings):
15
  super().__init__()
16
  self.settings = global_settings
 
32
 
33
  self.push_button_export_grna = self._find_widget('pbtnExportgRNA', QtWidgets.QPushButton)
34
 
35
+ # Connect gene selection change with direct signal
36
+ self.combo_box_gene.currentTextChanged.connect(self._on_gene_changed)
37
+
38
  def _init_grpGuideViewer(self):
39
  self.combo_box_gene = self._find_widget('cmbGene', QtWidgets.QComboBox)
40
  self.combo_box_endonuclease = self._find_widget('cmbEndonuclease', QtWidgets.QComboBox)
 
70
  return widget
71
 
72
def display_targets_in_table(self, targets):
    """Fill the targets table with the rows for the currently selected gene.

    The list passed on the first call is cached on ``self._complete_targets``.
    Each call then filters the cached list by the locus tag taken from the
    gene combo box (the text before ``': '``), compared case-insensitively
    with each target's ``'feature_id'``. With an empty selection every cached
    target is shown. The rows that end up in the table are kept on
    ``self._all_results``.

    Args:
        targets: Sequence of dict-like targets providing ``'location'``,
            ``'endonuclease'``, ``'sequence'``, ``'strand'``, ``'pam'`` and
            ``'score'``; ``'feature_id'`` and ``'azimuth_score'`` are read
            when present.

    Errors are logged and reported through ``show_error``; nothing is raised.
    """
    try:
        start_time = time.time()

        # Store complete set of targets if not already stored.
        # NOTE(review): a list passed on a later call is ignored in favour of
        # the cached one -- confirm callers rely on this before changing it.
        if not hasattr(self, '_complete_targets'):
            self._complete_targets = targets

        # Combo entries use the "locus_tag: gene_name" format; keep the tag.
        selected_text = self.combo_box_gene.currentText()
        selected_locus = selected_text.split(': ')[0] if ': ' in selected_text else selected_text

        if selected_locus:
            wanted_locus = selected_locus.lower()  # lower-case once, not per target
            filtered_targets = [
                target for target in self._complete_targets
                if str(target.get('feature_id', '')).strip().lower() == wanted_locus
            ]
        else:
            filtered_targets = self._complete_targets
        self._all_results = filtered_targets

        total_rows = len(filtered_targets)

        # Freeze painting and sorting while the table is rebuilt.
        self.setUpdatesEnabled(False)
        self.table_targets.setUpdatesEnabled(False)
        self.table_targets.setSortingEnabled(False)
        self.table_targets.setVisible(False)

        try:
            table = self.table_targets
            table.clearContents()
            table.setRowCount(total_rows)

            # An "Azimuth 2.0" column only exists once it has been added.
            headers = self.get_table_headers()
            azimuth_index = headers.index("Azimuth 2.0") if "Azimuth 2.0" in headers else None

            # Every cell is selectable but not editable. The flags are applied
            # to the score, off-target and Azimuth cells as well, so they
            # match the text columns and _handle_scroll_virtual.
            flags = Qt.ItemFlag.ItemIsEnabled | Qt.ItemFlag.ItemIsSelectable
            edit_role = QtCore.Qt.ItemDataRole.EditRole

            for row, target in enumerate(filtered_targets):
                text_values = (
                    target['location'], target['endonuclease'],
                    target['sequence'], target['strand'], target['pam'],
                )
                for col, value in enumerate(text_values):
                    item = QTableWidgetItem(str(value))
                    item.setFlags(flags)
                    table.setItem(row, col, item)

                # Stored as a float so the column sorts numerically.
                score_item = QTableWidgetItem()
                score_item.setData(edit_role, float(target['score']))
                score_item.setFlags(flags)
                table.setItem(row, 5, score_item)

                # Off-target placeholder.
                ot_item = QTableWidgetItem("--.--")
                ot_item.setFlags(flags)
                table.setItem(row, 6, ot_item)

                details_button = QtWidgets.QPushButton("Details")
                table.setCellWidget(row, 7, details_button)

                if azimuth_index is not None and 'azimuth_score' in target:
                    azimuth_item = QTableWidgetItem()
                    azimuth_item.setData(edit_role, float(target['azimuth_score']))
                    azimuth_item.setFlags(flags)
                    table.setItem(row, azimuth_index, azimuth_item)

            # Fixed widths for the eight standard columns.
            column_widths = [100, 100, 200, 80, 80, 80, 80, 100]
            for col, width in enumerate(column_widths):
                table.setColumnWidth(col, width)

        finally:
            # Always thaw the UI, even if populating a row failed.
            self.table_targets.setVisible(True)
            self.table_targets.setUpdatesEnabled(True)
            self.setUpdatesEnabled(True)
            self.table_targets.setSortingEnabled(True)

        total_time = time.time() - start_time
        self.logger.debug(f"Display time: {total_time:.2f} seconds for {total_rows} rows")

    except Exception as e:
        self.logger.error(f"Error in display_targets_in_table: {str(e)}")
        show_error(self.settings, "Error displaying targets", str(e))
170
+
171
+ def _handle_scroll_virtual(self, value, total_rows, row_height, buffer_rows):
172
+ """Handle virtual scrolling with minimal updates"""
173
+ try:
174
+ if not hasattr(self, '_all_results') or not self._all_results:
175
+ return
176
+
177
+ # Calculate visible range with safety checks
178
+ viewport_height = max(1, self.table_targets.viewport().height())
179
+ row_height = max(1, row_height) # Ensure non-zero
180
+ visible_rows = viewport_height // row_height
181
+
182
+ # Calculate which rows should be visible
183
+ current_row = value // row_height if row_height > 0 else 0
184
+ start_row = max(0, current_row - buffer_rows)
185
+ end_row = min(total_rows, current_row + visible_rows + buffer_rows)
186
+
187
+ # Only update rows that aren't already loaded
188
+ for row in range(start_row, end_row):
189
+ if row < len(self._all_results) and not self.table_targets.item(row, 0):
190
+ target = self._all_results[row]
191
+
192
+ # Create and set items efficiently
193
+ for col, value in enumerate([
194
+ target['location'], target['endonuclease'],
195
+ target['sequence'], target['strand'], target['pam'],
196
+ target['score'], "--.--"
197
+ ]):
198
+ item = QTableWidgetItem(str(value))
199
+ item.setFlags(Qt.ItemFlag.ItemIsEnabled | Qt.ItemFlag.ItemIsSelectable)
200
+ self.table_targets.setItem(row, col, item)
201
+
202
+ if not self.table_targets.cellWidget(row, 7):
203
+ details_button = QtWidgets.QPushButton("Details")
204
+ self.table_targets.setCellWidget(row, 7, details_button)
205
+
206
+ except Exception as e:
207
+ self.logger.error(f"Error in _handle_scroll_virtual: {str(e)}")
208
 
209
def get_selected_targets(self):
    """Collect the selected table rows as dictionaries of cell text.

    Returns:
        A list, ordered by row index, of dicts with the keys ``'location'``,
        ``'endonuclease'``, ``'sequence'``, ``'strand'``, ``'pam'`` and
        ``'score'``. Rows with a missing cell in any of those columns are
        skipped with a warning. An empty list is returned on error.
    """
    try:
        table = self.table_targets
        row_numbers = sorted({index.row() for index in table.selectedIndexes()})

        # Field name -> table column holding it.
        columns = {
            'location': 0,
            'endonuclease': 1,
            'sequence': 2,
            'strand': 3,
            'pam': 4,
            'score': 5
        }

        selected_targets = []
        for row in row_numbers:
            cells = {field: table.item(row, col) for field, col in columns.items()}

            # A row is only usable when every required cell exists.
            if any(cell is None for cell in cells.values()):
                self.logger.warning(f"Skipping row {row} due to missing data")
                continue

            selected_targets.append({field: cell.text() for field, cell in cells.items()})

        if not selected_targets:
            self.logger.warning("No valid targets selected")

        return selected_targets

    except Exception as e:
        self.logger.error(f"Error getting selected targets: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
        return []
251
 
252
  def get_row_data(self, row):
253
  return {
 
264
  self.combo_box_endonuclease.addItems(endonucleases)
265
 
266
def set_combo_box_gene(self, genes):
    """Replace the gene combo box contents with ``genes``.

    Signals and repaints are suspended while the items are swapped, and the
    first entry is selected afterwards. Because signals are blocked, the
    selection change made here is not announced to connected slots.

    Args:
        genes: Iterable of display strings; ``None`` is treated as empty.

    Errors are logged, never raised. Signals and updates are restored even
    when populating the combo box fails.
    """
    try:
        start_time = time.time()
        combo = self.combo_box_gene
        genes = list(genes) if genes is not None else []

        # Disable UI updates
        combo.blockSignals(True)
        combo.setUpdatesEnabled(False)
        try:
            combo.clear()

            self.logger.debug(f"Received {len(genes)} genes")

            if genes:
                # Add items in a single batch
                combo.insertItems(0, genes)

                # Set first item without triggering updates
                if combo.count() > 0:
                    combo.setCurrentIndex(0)

                self.logger.debug(f"Added {len(genes)} genes to combo box")
        finally:
            # Restore in a finally block: otherwise one failure above would
            # leave the combo box silent and frozen for good.
            combo.setUpdatesEnabled(True)
            combo.blockSignals(False)

        total_time = time.time() - start_time
        self.logger.debug(f"Combo box update time: {total_time:.2f} seconds")

    except Exception as e:
        self.logger.error(f"Error setting genes in combo box: {str(e)}")
        self.logger.error(f"Stack trace: {traceback.format_exc()}")
302
 
303
def set_text_edit_gene_viewer(self, sequence):
    """Show ``sequence`` in the gene viewer, or clear it when falsy.

    Args:
        sequence: Text to display; an empty or ``None`` value clears the
            viewer instead.

    Errors are logged, never raised.
    """
    try:
        viewer = self.text_edit_gene_viewer

        # Nothing to show: empty the widget and stop.
        if not sequence:
            viewer.clear()
            self.logger.debug("Cleared gene viewer - no sequence provided")
            return

        viewer.setText(sequence)
        self.logger.debug(f"Updated gene viewer with sequence of length: {len(sequence)}")
    except Exception as e:
        self.logger.error(f"Error setting gene viewer text: {str(e)}")
314
 
315
  def update_gene_info(self, info):
316
  # Implement this method if you have a widget to display gene info
 
353
def get_export_file_path(self):
    """Ask the user where to save and return the chosen path.

    Returns the first element of the ``QFileDialog.getSaveFileName`` result.
    """
    dialog_result = QtWidgets.QFileDialog.getSaveFileName(self, 'Save File')
    chosen_path = dialog_result[0]
    return chosen_path
356
+
357
+ def _on_gene_changed(self, selected_text):
358
+ """Handle gene selection change and emit signal"""
359
+ try:
360
+ self.logger.debug(f"Gene selection changed to: {selected_text}")
361
+
362
+ # Reset scroll position
363
+ self.table_targets.verticalScrollBar().setValue(0)
364
+
365
+ # Filter and display targets
366
+ if hasattr(self, '_complete_targets'):
367
+ self.display_targets_in_table(self._complete_targets)
368
+
369
+ # Emit signal for controller to update gene sequence
370
+ self.gene_selected.emit(selected_text)
371
+
372
+ except Exception as e:
373
+ self.logger.error(f"Error in _on_gene_changed: {str(e)}")
374
+ self.logger.error(f"Stack trace: {traceback.format_exc()}")
375
+
376
def get_table_headers(self):
    """Return the horizontal header labels of the targets table, left to right.

    ``horizontalHeaderItem`` returns ``None`` for a column that has no header
    item set. Such columns are reported as an empty string instead of
    raising ``AttributeError``.

    Returns:
        A list with one string per table column.
    """
    table = self.table_targets
    headers = []
    for column in range(table.columnCount()):
        header_item = table.horizontalHeaderItem(column)
        headers.append(header_item.text() if header_item is not None else "")
    return headers
382
+
383
def add_scoring_column(self, algorithm_name, position=None):
    """Insert a column for an additional scoring algorithm.

    Args:
        algorithm_name: Header text for the new column.
        position: Column index at which to insert. ``None`` appends after
            the last column. Values outside ``0..columnCount()`` are clamped
            into that range.

    Returns:
        The index at which the column was inserted.
    """
    table = self.table_targets
    column_count = table.columnCount()

    if position is None:
        # Add to end if no position specified
        position = column_count
    else:
        position = max(0, min(position, column_count))

    # insertColumn already moves the existing columns (items and headers)
    # one place to the right. Shifting them again by hand would push every
    # column one too far, overwrite the last column, and move the new header
    # away from the returned position.
    table.insertColumn(position)
    table.setHorizontalHeaderItem(
        position,
        QtWidgets.QTableWidgetItem(algorithm_name)
    )

    return position
src/views/closingWin.py DELETED
@@ -1,73 +0,0 @@
1
- import models.GlobalSettings as GlobalSettings
2
- import os
3
- from PyQt5 import QtWidgets, Qt, uic
4
- import traceback
5
- import math
6
- from utils.ui import show_error, scale_ui
7
-
8
- logger = GlobalSettings.logger
9
-
10
- ###########################################################
11
- # closingWindow: this class is a little window where the user can select which files they want to delete
12
- # Once they hit 'submit' it will delete all of the files selected, and close the program.
13
- # If no files are selected, the program closes and no files are deleted
14
- # Inputs are taken from the user (selecting files to delete and hitting submit), as well as GlobalSettings for the files in CSPR_DB
15
- # Outputs: the selected files are deleted, and the program is closed
16
- ###########################################################
17
- class closingWindow(QtWidgets.QMainWindow):
18
- def __init__(self):
19
- try:
20
- super(closingWindow, self).__init__()
21
- uic.loadUi(GlobalSettings.appdir + "ui/closing_window.ui", self)
22
- self.setWindowTitle("Delete Files")
23
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.ico"))
24
-
25
- # Button
26
- self.submit_button.clicked.connect(self.submit_and_close)
27
-
28
- # Table
29
- self.files_table.setColumnCount(1)
30
- self.files_table.setShowGrid(True)
31
- self.files_table.setHorizontalHeaderLabels("File Name;".split(";"))
32
- self.files_table.setSelectionBehavior(QtWidgets.QAbstractItemView.SelectRows)
33
- self.files_table.setEditTriggers(QtWidgets.QAbstractItemView.NoEditTriggers)
34
- self.files_table.setSelectionMode(QtWidgets.QAbstractItemView.MultiSelection)
35
-
36
-
37
- scale_ui(self, custom_scale_width=400, custom_scale_height=300)
38
-
39
-
40
- except Exception as e:
41
- show_error("Error initializing closingWindow class.", e)
42
-
43
- # this function will delete selected files, and then close the program
44
- def submit_and_close(self):
45
- try:
46
- # loop through the whole table
47
- for i in range(self.files_table.rowCount()):
48
- tabWidget = self.files_table.item(i, 0)
49
-
50
- # if that specific tab is selected, delete it. otherwise do nothing
51
- if tabWidget.isSelected():
52
- os.remove(tabWidget.text())
53
- self.close()
54
- except Exception as e:
55
- show_error("Error in sumbit_and_close() in closing window.", e)
56
-
57
- # this function gets all of the files from the CSPR_DB and puts them all into the table
58
- def get_files(self):
59
- try:
60
- loopCount = 0
61
- # get the file names from CSPR_DB
62
- files_names = os.listdir(GlobalSettings.CSPR_DB)
63
- files_names.sort(key=str.lower)
64
- self.files_table.setRowCount(len(files_names))
65
-
66
- # loop through and add them to the table
67
- for file in files_names:
68
- tabWidget = QtWidgets.QTableWidgetItem(file)
69
- self.files_table.setItem(loopCount, 0, tabWidget)
70
- loopCount += 1
71
- self.files_table.resizeColumnsToContents()
72
- except Exception as e:
73
- show_error("Error in get_files() in closing window.", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/export_tool.py DELETED
@@ -1,259 +0,0 @@
1
- import models.GlobalSettings as GlobalSettings
2
- from utils.sequence_utils import get_table_headers
3
- import os
4
- from PyQt5 import QtWidgets, Qt, uic, QtCore, QtGui
5
- import platform
6
- import traceback
7
- import math
8
- from utils.ui import show_message, show_error, scale_ui, center_ui
9
-
10
- logger = GlobalSettings.logger
11
-
12
- # This class opens a window for the user to select where they want the CSV file exported to, and the name of the file
13
- # It takes the highlighted data from the Results page, and creates a CSV file from that
14
- class export_tool(QtWidgets.QMainWindow):
15
- def __init__(self):
16
- try:
17
- super(export_tool, self).__init__()
18
- uic.loadUi(GlobalSettings.appdir + 'ui/export_tool.ui', self)
19
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + "cas9image.ico"))
20
-
21
- self.browse_button.clicked.connect(self.browseForFolder)
22
- self.cancel_button.clicked.connect(self.cancel_function)
23
- self.export_button.clicked.connect(self.export_function)
24
-
25
- # Set up validators for input fields:
26
- reg_ex = QtCore.QRegExp("[^,]+") # No commas
27
- input_validator = QtGui.QRegExpValidator(reg_ex, self)
28
- self.leading_seq.setValidator(input_validator)
29
- self.trailing_seq.setValidator(input_validator)
30
-
31
- # GroupBox styling
32
- groupbox_style = """
33
- QGroupBox:title{subcontrol-origin: margin;
34
- left: 10px;
35
- padding: 0 5px 0 5px;}
36
- QGroupBox#gRNA_Options{border: 2px solid rgb(111,181,110);
37
- border-radius: 9px;
38
- margin-top: 10px;
39
- font: bold 14pt 'Arial';} """
40
- self.gRNA_Options.setStyleSheet(groupbox_style)
41
-
42
- self.location = self.fileLocation_line_edit.text()
43
- self.selected_table_items = []
44
- self.window = ""
45
- self.num_columns = []
46
- self.locus_tag = False
47
- self.gene_name = False
48
-
49
- self.setWindowTitle("Export to CSV")
50
- scale_ui(self, custom_scale_width=650, custom_scale_height=200)
51
-
52
- except Exception as e:
53
- show_error("Error initializing export_tool class.", e)
54
-
55
- # launch function. Called in Results.
56
- # parameter expect: a list of the items selected from the window.
57
- def launch(self, select_items, window):
58
- try:
59
- if platform.system() == "Windows":
60
- self.fileLocation_line_edit.setText(GlobalSettings.CSPR_DB + "\\")
61
- else:
62
- self.fileLocation_line_edit.setText(GlobalSettings.CSPR_DB + "/")
63
- self.selected_table_items = select_items
64
- self.window = window
65
- center_ui(self)
66
- self.show()
67
- self.activateWindow()
68
- except Exception as e:
69
- show_error("Error in launch() in export_tool.", e)
70
-
71
- # Takes the path and file name and combines them
72
- # Writes the header line, as well as every line selected, to that file
73
- # calls the cancel function when it's done
74
- def export_function(self):
75
- try:
76
- delim = self.delimBox.currentText()
77
- # get the full path ( path and file name)
78
- file_name = self.filename_line_edit.text()
79
- if file_name == "":
80
- file_name = "exported_gRNAs"
81
- self.location = self.fileLocation_line_edit.text()
82
- full_path = ""
83
- if '.' in file_name: # If user added the file extension...
84
- full_path = self.location + file_name
85
- else:
86
- if delim == ",":
87
- full_path = self.location + file_name + '.csv'
88
- elif delim == r"\t":
89
- delim = "\t"
90
- full_path = self.location + file_name + '.tsv'
91
- else:
92
- full_path = self.location + file_name + '.txt'
93
- try:
94
- output_data = open(full_path, 'w')
95
- """ Write the table headers """
96
- if self.window == "mt": ###Change headers for multitargeting table export
97
- headers = get_table_headers(GlobalSettings.MTWin.table)
98
- num_cols = len(headers) # Calculate the number of columns based on the headers list above
99
- insertion_index = headers.index("% Consensus")
100
- headers.insert(insertion_index, "Full Sequence")
101
- output_data.write(delim.join(headers)+"\n")
102
- elif self.window == "pa":
103
- headers = get_table_headers(GlobalSettings.pop_Analysis.table2)
104
- num_cols = len(headers) # Calculate the number of columns based on the headers list above
105
- insertion_index = headers.index("% Consensus")
106
- headers.insert(insertion_index, "Full Sequence")
107
- output_data.write(delim.join(headers)+"\n")
108
- else: ###Change headers for view results export
109
- headers = get_table_headers(GlobalSettings.mainWindow.Results.targetTable)
110
- headers.remove("Details") # For some reason, the details column doesn't carry any "items"
111
- num_cols = len(headers) # Calculate the number of columns based on the headers list above
112
- insertion_index = headers.index("Strand")
113
- headers.insert(insertion_index, "Full Sequence")
114
-
115
- if GlobalSettings.mainWindow.radioButton_Gene.isChecked(): # If the user chose to search via Feature
116
- tmp = GlobalSettings.mainWindow.Results.comboBoxGene.currentText().split(":") # Check to see if the locus tag was found for the current gene
117
- if len(tmp) > 1: # If locus tag exists for gene, include in output
118
- headers.extend(["Locus_Tag","Gene_Name"])
119
- output_data.write(delim.join(headers)+"\n")
120
- self.locus_tag = True
121
- self.gene_name = True
122
- else: # If locus tag does not exist for gene, only include the gene name
123
- headers.append("Gene_Name")
124
- output_data.write(delim.join(headers)+"\n")
125
- self.gene_name = True
126
- self.locus_tag = False
127
- else: # If user searched by sequence or position, don't include locus tag or gene name
128
- output_data.write(delim.join(headers)+"\n")
129
- self.gene_name = False
130
- self.locus_tag = False
131
-
132
- """ Write the data out """
133
- tmp_list = []
134
- if self.locus_tag: #If the user is exporting data from VT and locus tag exists for current gene
135
- tmp = GlobalSettings.mainWindow.Results.comboBoxGene.currentText().split(":") # Get the locus tag
136
- locus_tag = str(tmp[0].strip())
137
- gene_name = str(tmp[-1].strip())
138
- seq_index = headers.index("Sequence") # Get the gene name
139
- it = 0
140
- for i, item in enumerate(self.selected_table_items): # Loop through all the items in the View Targets table
141
- if (i+1) % num_cols == 0:
142
- tmp_list.append(item.text())
143
- tmp_list.append(locus_tag)
144
- tmp_list.append(gene_name)
145
- output_data.write(delim.join(tmp_list)+"\n") # Write data out
146
- tmp_list.clear() # Reset list
147
- it = 0 # Reset iterator
148
- elif it == seq_index:
149
- tmp_list.append(item.text())
150
- tmp_list.append(self.leading_seq.text().strip() + item.text() + self.trailing_seq.text().strip()) #5' Leader + gRNA + 3' Trailer
151
- it += 1
152
- else:
153
- tmp_list.append(item.text())
154
- it += 1
155
- elif self.gene_name: #If the user is exporting data from VT and locus tag doesn't exist for current gene
156
- gene_name = str(GlobalSettings.mainWindow.Results.comboBoxGene.currentText().strip()) # Get the locus tag
157
- seq_index = headers.index("Sequence") # Get the gene name
158
- it = 0
159
- for i, item in enumerate(self.selected_table_items): # Loop through all the items in the View Targets table
160
- if (i+1) % num_cols == 0:
161
- tmp_list.append(item.text())
162
- tmp_list.append(gene_name)
163
- output_data.write(delim.join(tmp_list)+"\n")
164
- tmp_list.clear()
165
- it = 0 # Reset iterator
166
- elif it == seq_index:
167
- tmp_list.append(item.text())
168
- tmp_list.append(self.leading_seq.text().strip() + item.text() + self.trailing_seq.text().strip()) #5' Leader + gRNA + 3' Trailer
169
- it += 1
170
- else:
171
- tmp_list.append(item.text())
172
- it += 1
173
- elif self.window in ["mt", "pa"]: #If the user is exporting data from multitargeting
174
- seq_index = headers.index("Consensus Sequence")
175
- it = 0
176
- for i, item in enumerate(self.selected_table_items): # Loop through all the items in the View Targets table
177
- if (i+1) % num_cols == 0:
178
- tmp_list.append(item.text())
179
- output_data.write(str(delim.join(tmp_list))+"\n")
180
- tmp_list.clear()
181
- it = 0 # Reset iterator
182
- elif it == seq_index:
183
- tmp_list.append(item.text())
184
- tmp_list.append(self.leading_seq.text().strip() + item.text() + self.trailing_seq.text().strip()) #5' Leader + gRNA + 3' Trailer
185
- it += 1
186
- else:
187
- tmp_list.append(item.text())
188
- it += 1
189
- else: #If the user is exporting data from View Targets but is not using Feature search
190
- seq_index = headers.index("Sequence") # Get the gene name
191
- it = 0
192
- for i, item in enumerate(self.selected_table_items): # Loop through all the items in the View Targets table
193
- if (i+1) % num_cols == 0:
194
- tmp_list.append(item.text())
195
- output_data.write(delim.join(tmp_list)+"\n")
196
- tmp_list.clear()
197
- it = 0 # Reset iterator
198
- elif it == seq_index:
199
- tmp_list.append(item.text())
200
- tmp_list.append(self.leading_seq.text().strip() + item.text() + self.trailing_seq.text().strip()) #5' Leader + gRNA + 3' Trailer
201
- it += 1
202
- else:
203
- tmp_list.append(item.text())
204
- it += 1
205
- output_data.close()
206
- except PermissionError:
207
- show_error("This file cannot be opened. Please make sure that the file is not opened elsewhere and try again.", e)
208
- return
209
-
210
- except Exception as e:
211
- show_error("Error in export_function() in export_tool.", e)
212
- return
213
-
214
- """ Print "finished" message """
215
- show_message(
216
- fontSize=12,
217
- icon=QtWidgets.QMessageBox.Icon.Information,
218
- title="Export Complete",
219
- message=f"Export to {full_path} was successful."
220
- )
221
-
222
- # close the window
223
- self.cancel_function()
224
- except Exception as e:
225
- show_error("Error in export_function() in export_tool.", e)
226
-
227
- # Resets everything to the init function
228
- # then closes the window
229
- def cancel_function(self):
230
- try:
231
- if platform.system() == "Windows":
232
- self.fileLocation_line_edit.setText(GlobalSettings.CSPR_DB + "\\")
233
- else:
234
- self.fileLocation_line_edit.setText(GlobalSettings.CSPR_DB + "/")
235
- self.filename_line_edit.setText("")
236
- self.location = ""
237
- self.hide()
238
- except Exception as e:
239
- show_error("Error in cancel_function() in export_tool.", e)
240
-
241
- # browse for folder function
242
- # allows user to browse for a folder where to store the CSV file
243
- def browseForFolder(self):
244
- try:
245
- # get the folder
246
- filed = QtWidgets.QFileDialog()
247
- mydir = QtWidgets.QFileDialog.getExistingDirectory(filed, "Open a Folder",
248
- GlobalSettings.CSPR_DB, QtWidgets.QFileDialog.ShowDirsOnly)
249
- if(os.path.isdir(mydir) == False):
250
- return
251
-
252
- if platform.system() == "Windows":
253
- self.fileLocation_line_edit.setText(mydir + "\\")
254
- self.location = mydir + "\\"
255
- else:
256
- self.fileLocation_line_edit.setText(mydir + "/")
257
- self.location = mydir + "/"
258
- except Exception as e:
259
- show_error("Error in browseForFolder() in export_tool.", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/views/generateLib.py DELETED
@@ -1,662 +0,0 @@
1
- import models.GlobalSettings as GlobalSettings
2
- import os
3
- from PyQt5 import QtWidgets, Qt, uic, QtCore
4
- from functools import partial
5
- from models.CSPRparser import CSPRparser
6
- import re
7
- import platform
8
- import traceback
9
- import math
10
- from utils.ui import show_message, show_error, scale_ui, center_ui
11
- from views.annotation_functions import *
12
-
13
- logger = GlobalSettings.logger
14
-
15
- # this class is a window that allows the user to select the settings for Generate Library
16
- # When the user clicks Generate Library, it goes ahead and gets the Annotation Data needed
17
- # Then the user can select the settings they want, and then hit submit.
18
- # It creates a txt file with the data
19
- class genLibrary(QtWidgets.QMainWindow):
20
- def __init__(self):
21
- try:
22
- super(genLibrary, self).__init__()
23
- uic.loadUi(GlobalSettings.appdir + 'ui/generate_library.ui', self)
24
- self.setWindowTitle('Generate Library')
25
- self.setWindowIcon(Qt.QIcon(GlobalSettings.appdir + 'cas9image.ico'))
26
-
27
- groupbox_style = """
28
- QGroupBox:title{subcontrol-origin: margin;
29
- left: 10px;
30
- padding: 0 5px 0 5px;}
31
- QGroupBox#Step1{border: 2px solid rgb(111,181,110);
32
- border-radius: 9px;
33
- font: bold 14pt 'Arial';
34
- margin-top: 10px;}"""
35
- self.Step1.setStyleSheet(groupbox_style)
36
- self.Step2.setStyleSheet(groupbox_style.replace("Step1", "Step2"))
37
- self.Step3.setStyleSheet(groupbox_style.replace("Step1", "Step3"))
38
- self.Step4.setStyleSheet(groupbox_style.replace("Step1", "Step4"))
39
-
40
- self.cancel_button.clicked.connect(self.cancel_function)
41
- self.BrowseButton.clicked.connect(self.browse_function)
42
- self.submit_button.clicked.connect(self.submit_data)
43
- self.progressBar.setValue(0)
44
-
45
- self.anno_data = dict()
46
- self.kegg_nonKegg = ''
47
- self.gen_lib_dict = dict()
48
- self.cspr_data = dict()
49
- self.Output = dict()
50
- self.off_tol = .05
51
- self.off_max_misMatch = 4
52
- self.off_target_running = False
53
- self.parser = CSPRparser("")
54
-
55
- # set the numbers for the num genes combo box item
56
- for i in range(10):
57
- self.numGenescomboBox.addItem(str(i + 1))
58
-
59
- # set the numbers for the minOn combo box
60
- for i in range(19, 70):
61
- self.minON_comboBox.addItem(str(i + 1))
62
-
63
- scale_ui(self, custom_scale_width=950, custom_scale_height=500)
64
-
65
- except Exception as e:
66
- show_error("Error initializing generate library class.", e)
67
-
68
- # this function launches the window
69
- # Parameters:
70
- # annotation_data: a dictionary that has the data for the annotations searched for
71
- # currently MainWindow's searches dict is passed into this
72
- # org_file: the cspr_file that pertains to the organism that user is using at the time
73
- # anno_type: whether the user is using KEGG or another type of annotation file
74
- def launch(self, annotation_data, org_file, anno_type):
75
- try:
76
- self.cspr_file = org_file
77
- self.db_file = org_file[:org_file.find('.')] + '_repeats.db'
78
- self.anno_data = annotation_data
79
- self.kegg_nonKegg = anno_type
80
- self.process = QtCore.QProcess()
81
- self.parser.fileName = org_file
82
-
83
- # setting the path and file name fields
84
- index1 = self.cspr_file.find('.')
85
- if platform.system() == "Windows":
86
- index2 = self.cspr_file.rfind('\\')
87
- else:
88
- index2 = self.cspr_file.rfind('/')
89
-
90
- self.filename_input.setText(self.cspr_file[index2 + 1:index1] + '_lib')
91
-
92
-
93
- if platform.system() == "Windows":
94
- self.output_path.setText(GlobalSettings.CSPR_DB + "\\")
95
- else:
96
- self.output_path.setText(GlobalSettings.CSPR_DB + "/")
97
-
98
- # depending on the type of file, build the dictionary accordingly
99
- self.build_dict_non_kegg()
100
-
101
- # get the gRNA data from the cspr file
102
- self.cspr_data = self.parser.gen_lib_parser(self.gen_lib_dict, GlobalSettings.mainWindow.endoChoice.currentText())
103
- self.get_endo_data()
104
-
105
- center_ui(self)
106
- self.show()
107
- self.activateWindow()
108
- except Exception as e:
109
- show_error("Error in launch() in generate library.", e)
110
-
111
- def get_endo_data(self):
112
- try:
113
- f = open(GlobalSettings.appdir + "CASPERinfo")
114
- self.endo_data = {}
115
- while True:
116
- line = f.readline()
117
- if line.startswith('ENDONUCLEASES'):
118
- while True:
119
- line = f.readline()
120
- line = line.replace("\n","")
121
- if (line[0] == "-"):
122
- break
123
- line_tokened = line.split(";")
124
- if len(line_tokened) == 10:
125
- endo = line_tokened[0]
126
- five_length = line_tokened[2]
127
- seed_length = line_tokened[3]
128
- three_length = line_tokened[4]
129
- prime = line_tokened[5]
130
- hsu = line_tokened[9]
131
- self.endo_data[endo] = [int(five_length) + int(three_length) + int(seed_length), prime, "MATRIX:" + hsu]
132
-
133
- break
134
- f.close()
135
- except Exception as e:
136
- show_error("Error in get_endo_data() in generate library.", e)
137
-
138
- # this is here in case the user clicks 'x' instead of cancel. Just calls the cancel function
139
- def closeEvent(self, event):
140
- try:
141
- closeWindow = self.cancel_function()
142
-
143
- # if the user is doing OT and does not decide to cancel it ignore the event
144
- if closeWindow == -2:
145
- event.ignore()
146
- else:
147
- event.accept()
148
- except Exception as e:
149
- show_error("Error in closeEvent() in generate library.", e)
150
-
151
- # this function takes all of the cspr data and compresses it again for off-target usage
152
- def compress_file_off(self):
153
- try:
154
- if platform.system() == "Windows":
155
- file = GlobalSettings.CSPR_DB + "\\off_input.txt"
156
- else:
157
- file = GlobalSettings.CSPR_DB + "/off_input.txt"
158
- f = open(file, 'w')
159
- for gene in self.cspr_data:
160
- for j in range(len(self.cspr_data[gene])):
161
- loc = self.cspr_data[gene][j][0]
162
- seq = self.cspr_data[gene][j][1]
163
- pam = self.cspr_data[gene][j][2]
164
- score = self.cspr_data[gene][j][3]
165
- strand = self.cspr_data[gene][j][4]
166
- output = str(loc) + ';' + str(seq) + ';' + str(pam) + ';' + str(score) + ';' + str(strand)
167
- f.write(output + '\n')
168
- f.close()
169
- except Exception as e:
170
- show_error("Error in compress_file_off() in generate library.", e)
171
-
172
- # this function parses the temp_off file, which holds the off-target analysis results
173
- # it also updates each target in the cspr_data dictionary to replace the endo with the target's results in off-target
174
- def parse_off_file(self):
175
- try:
176
- if platform.system() == "Windows":
177
- file = GlobalSettings.CSPR_DB + "\\temp_off.txt"
178
- else:
179
- file = GlobalSettings.CSPR_DB + "/temp_off.txt"
180
- f = open(file, "r")
181
- file_data = f.read().split('\n')
182
- f.close()
183
- scoreDict = dict()
184
-
185
- # get the data from the file
186
- for i in range(len(file_data)):
187
- if file_data[i] == 'AVG OUTPUT':
188
- continue
189
- elif file_data[i] != '':
190
- buffer = file_data[i].split(':')
191
- scoreDict[buffer[0]] = buffer[1]
192
-
193
- # update cspr_Data
194
- for gene in self.cspr_data:
195
- for i in range(len(self.cspr_data[gene])):
196
- tempTuple = (self.cspr_data[gene][i][0], self.cspr_data[gene][i][1], self.cspr_data[gene][i][2], self.cspr_data[gene][i][3], self.cspr_data[gene][i][4], scoreDict[self.cspr_data[gene][i][1]])
197
- self.cspr_data[gene][i] = tempTuple
198
- except Exception as e:
199
- show_error("Error in parse_off_file() in generate library.", e)
200
-
201
- # this function runs the off_target command
202
- # NOTE: some changes may be needed to get it to work with other OS besides windows
203
- def get_offTarget_data(self, num_targets, minScore, spaceValue, output_file, fiveseq):
204
- try:
205
- self.perc = False
206
- self.bool_temp = False
207
- self.running = False
208
-
209
- # when finished, parse the off file, and then generate the lib
210
- def finished():
211
- if self.off_target_running:
212
- self.progressBar.setValue(100)
213
- self.parse_off_file()
214
- did_work = self.generate(num_targets, minScore, spaceValue, output_file, fiveseq)
215
- self.off_target_running = False
216
- #self.process.kill()
217
- if did_work != -1:
218
- self.cancel_function()
219
- show_message(
220
- fontSize=12,
221
- icon=QtWidgets.QMessageBox.Icon.Information,
222
- title="Library Generated!",
223
- message="CASPER has finished generating your library!"
224
- )
225
- os.remove(GlobalSettings.CSPR_DB + '/off_input.txt')
226
- os.remove(GlobalSettings.CSPR_DB + '/temp_off.txt')
227
-
228
- # as off-targeting outputs things, update the off-target progress bar
229
- def progUpdate(p):
230
- line = str(self.process.readAllStandardOutput())
231
- line = line[2:]
232
- line = line[:len(line) - 1]
233
- if platform.system() == 'Windows':
234
- for lines in filter(None, line.split(r'\r\n')):
235
- if (lines.find("Running Off Target Algorithm for") != -1 and self.perc == False):
236
- self.perc = True
237
- if (self.perc == True and self.bool_temp == False and lines.find(
238
- "Running Off Target Algorithm for") == -1):
239
- lines = lines[32:]
240
- lines = lines.replace("%", "")
241
- if (float(lines) <= 99.5):
242
- num = float(lines)
243
- self.progressBar.setValue(num)
244
- else:
245
- self.bool_temp = True
246
- else:
247
- for lines in filter(None, line.split(r'\n')):
248
- if (lines.find("Running Off Target Algorithm for") != -1 and self.perc == False):
249
- self.perc = True
250
- if (self.perc == True and self.bool_temp == False and lines.find(
251
- "Running Off Target Algorithm for") == -1):
252
- lines = lines[32:]
253
- lines = lines.replace("%", "")
254
- if (float(lines) <= 99.5):
255
- num = float(lines)
256
- self.progressBar.setValue(num)
257
- else:
258
- self.bool_temp = True
259
-
260
- if platform.system() == 'Windows':
261
- app_path = GlobalSettings.appdir
262
- exe_path = app_path + 'OffTargetFolder\\OT_Win.exe'
263
- output_path = '"' + GlobalSettings.CSPR_DB + '\\temp_off.txt" '
264
- data_path = '"' + GlobalSettings.CSPR_DB + "\\off_input.txt" + '" '
265
- elif platform.system() == 'Linux':
266
- app_path = GlobalSettings.appdir.replace('\\', '/')
267
- exe_path = app_path + r'OffTargetFolder/OT_Lin'
268
- output_path = '"' + GlobalSettings.CSPR_DB + '/temp_off.txt" '
269
- data_path = '"' + GlobalSettings.CSPR_DB + "/off_input.txt" + '" '
270
- else:
271
- app_path = GlobalSettings.appdir.replace('\\', '/')
272
- exe_path = app_path + r'OffTargetFolder/OT_Mac'
273
- output_path = '"' + GlobalSettings.CSPR_DB + '/temp_off.txt" '
274
- data_path = '"' + GlobalSettings.CSPR_DB + "/off_input.txt" + '" '
275
- exe_path = '"' + exe_path + '" '
276
- cspr_path = '"' + self.cspr_file + '" '
277
- db_path = '"' + self.db_file + '" '
278
- filename = output_path
279
- filename = filename[:len(filename) - 1]
280
- filename = filename[1:]
281
- filename = filename.replace('"', '')
282
- CASPER_info_path = '"' + app_path + 'CASPERinfo' +'" '
283
- num_of_mismathes = self.off_max_misMatch
284
- tolerance = self.off_tol # create command string
285
- endo = '"' + GlobalSettings.mainWindow.endoChoice.currentText() + '" '
286
- detailed_output = " False "
287
- avg_output = "True"
288
- hsu = ' "' + self.endo_data[GlobalSettings.mainWindow.endoChoice.currentText()][2] + '"'
289
-
290
- # set the off_target_running to true, to keep the user from closing the window while it is running
291
- self.off_target_running = True
292
-
293
- cmd = exe_path + data_path + endo + cspr_path + db_path + output_path + CASPER_info_path + str(
294
- num_of_mismathes) + ' ' + str(tolerance) + detailed_output + avg_output + hsu
295
-
296
- if platform.system() == 'Windows':
297
- cmd = cmd.replace('/', '\\')
298
- self.process.readyReadStandardOutput.connect(partial(progUpdate, self.process))
299
- self.process.readyReadStandardError.connect(partial(progUpdate, self.process))
300
- self.progressBar.setValue(0)
301
- QtCore.QTimer.singleShot(100, partial(self.process.start, cmd))
302
- self.process.finished.connect(finished)
303
- except Exception as e:
304
- show_error("Error in get_offTarget_data() in generate library.", e)
305
-
306
- # submit function
307
- # this function takes all of the input from the window, and calls the generate function
308
- # Still need to add the checks for 5' seq, and the percentage thing
309
- def submit_data(self):
310
- try:
311
- if self.off_target_running:
312
- return
313
- output_file = self.output_path.text() + self.filename_input.text()
314
-
315
- minScore = int(self.minON_comboBox.currentText())
316
- num_targets = int(self.numGenescomboBox.currentText())
317
- fiveseq = ''
318
-
319
- # error check for csv files
320
- if output_file.endswith('.txt'):
321
- output_file = output_file.replace('.txt', '.csv')
322
- elif not output_file.endswith('.txt') and not output_file.endswith('.csv'):
323
- output_file = output_file + '.csv'
324
-
325
- # error checking for the space value
326
- # if they enter nothing, default to 15 and also make sure it's actually a digit
327
- if self.space_line_edit.text() == '':
328
- spaceValue = 15
329
- elif self.space_line_edit.text().isdigit():
330
- spaceValue = int(self.space_line_edit.text())
331
- elif not self.space_line_edit.text().isdigit():
332
- show_message(
333
- fontSize=12,
334
- icon=QtWidgets.QMessageBox.Icon.Critical,
335
- title="Error",
336
- message="Please enter integers only for space between guides."
337
- )
338
- return
339
- # if space value is more than 200, default to 200
340
- if spaceValue > 200:
341
- spaceValue = 200
342
- elif spaceValue < 0:
343
- show_message(
344
- fontSize=12,
345
- icon=QtWidgets.QMessageBox.Icon.Critical,
346
- title="Error",
347
- message="Please enter a space-value that is 0 or greater."
348
- )
349
- return
350
-
351
- if self.find_off_Checkbox.isChecked():
352
- self.compress_file_off()
353
-
354
- # get the fiveprimseq data and error check it
355
- if self.fiveprimeseq.text() != '' and self.fiveprimeseq.text().isalpha():
356
- fiveseq = self.fiveprimeseq.text()
357
- elif self.fiveprimeseq.text() != '' and not self.fiveprimeseq.text().isalpha():
358
- show_message(
359
- fontSize=12,
360
- icon=QtWidgets.QMessageBox.Icon.Critical,
361
- title="Error",
362
- message="Please make sure only the letters A, T, G, or C are added into 5' End specificity box."
363
- )
364
- return
365
-
366
- # get the targeting range data, and error check it here
367
- if not self.start_target_range.text().isdigit() or not self.end_target_range.text().isdigit():
368
- show_message(
369
- fontSize=12,
370
- icon=QtWidgets.QMessageBox.Icon.Critical,
371
- title="Error",
372
- message="Error: Please make sure that the start and end target ranges are numbers only. Please make sure that start is 0 or greater, and end is 100 or less. "
373
- )
374
- return
375
- elif int(self.start_target_range.text()) >= int(self.end_target_range.text()):
376
- show_message(
377
- fontSize=12,
378
- icon=QtWidgets.QMessageBox.Icon.Critical,
379
- title="Error",
380
- message="Please make sure that the start number is always less than the end number"
381
- )
382
- return
383
-
384
- # if they check Off-Targeting
385
- if self.find_off_Checkbox.isChecked():
386
- # make sure its a digit
387
- if self.maxOFF_comboBox.text() == '' or not self.maxOFF_comboBox.text().isdigit() and '.' not in self.maxOFF_comboBox.text():
388
- show_message(
389
- fontSize=12,
390
- icon=QtWidgets.QMessageBox.Icon.Critical,
391
- title="Error",
392
- message="Please enter only numbers for Maximum Off-Target Score. It cannot be left blank"
393
- )
394
- return
395
- else:
396
- # make sure it between 0 and .5
397
- if not 0.0 < float(self.maxOFF_comboBox.text()) <= .5:
398
- show_message(
399
- fontSize=12,
400
- icon=QtWidgets.QMessageBox.Icon.Critical,
401
- title="Error",
402
- message="Please enter a max off-target score between 0 and 0.5!"
403
- )
404
- return
405
- # compress the data, and then run off-targeting
406
- self.compress_file_off()
407
- self.get_offTarget_data(num_targets, minScore, spaceValue, output_file, fiveseq)
408
- else:
409
- # actually call the generate function
410
- did_work = self.generate(num_targets, minScore, spaceValue, output_file, fiveseq)
411
-
412
- if did_work != -1:
413
- self.cancel_function()
414
- show_message(
415
- fontSize=12,
416
- icon=QtWidgets.QMessageBox.Icon.Critical,
417
- title="Library Generated!",
418
- message="CASPER has finished generating your library!"
419
- )
420
- except Exception as e:
421
- show_error("Error in submit_data() in generate library.", e)
422
-
423
- # clears everything and hides the window
424
- def cancel_function(self):
425
- try:
426
- if self.off_target_running:
427
- msgBox = QtWidgets.QMessageBox()
428
- msgBox.setStyleSheet("font: " + str(12) + "pt 'Arial'")
429
- msgBox.setIcon(QtWidgets.QMessageBox.Icon.Question)
430
- msgBox.setWindowTitle("Off-Targeting is running")
431
- msgBox.setText(
432
- "Off-Targetting is running. Closing this window will cancel that process, and return to the main window. .\n Do you wish to continue?")
433
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.Yes)
434
- msgBox.addButton(QtWidgets.QMessageBox.StandardButton.No)
435
- msgBox.exec()
436
-
437
- if (msgBox.result() == QtWidgets.QMessageBox.No):
438
- return -2
439
- else:
440
- self.off_target_running = False
441
- self.process.kill()
442
-
443
- self.cspr_file = ''
444
- self.anno_data = list()
445
-
446
- self.filename_input.setText('')
447
- self.output_path.setText('')
448
-
449
- self.gen_lib_dict.clear()
450
- self.cspr_data.clear()
451
- self.Output.clear()
452
-
453
- self.start_target_range.setText('0')
454
- self.end_target_range.setText('100')
455
- self.space_line_edit.setText('15')
456
- self.find_off_Checkbox.setChecked(False)
457
- self.modifyParamscheckBox.setChecked(False)
458
- self.maxOFF_comboBox.setText('')
459
- self.fiveprimeseq.setText('')
460
- self.off_target_running = False
461
- self.progressBar.setValue(0)
462
-
463
- self.hide()
464
- except Exception as e:
465
- show_error("Error in cancel_function() in generate library.", e)
466
-
467
- # allows the user to browse for a folder
468
- # stores their selection in the output_path line edit
469
- def browse_function(self):
470
- try:
471
- if self.off_target_running:
472
- return
473
- # get the folder
474
- filed = QtWidgets.QFileDialog()
475
- mydir = QtWidgets.QFileDialog.getExistingDirectory(filed, "Open a Folder",
476
- GlobalSettings.CSPR_DB, QtWidgets.QFileDialog.ShowDirsOnly)
477
- if(os.path.isdir(mydir) == False):
478
- return
479
-
480
- # make sure to append the '/' to the folder path
481
- if platform.system() == "Windwos":
482
- self.output_path.setText(mydir + "\\")
483
- else:
484
- self.output_path.setText(mydir + "/")
485
- except Exception as e:
486
- show_error("Error in browse_function() in generate library.", e)
487
-
488
- # this function builds the dictionary that is used in the generate function
489
- # this is the version that builds it from data from feature_table, gbff, or gff
490
- # builds it exactly as Brian built it in the files given
491
- def build_dict_non_kegg(self):
492
- try:
493
- for tuple in self.anno_data:
494
- chrom = tuple[0]
495
- feature = tuple[1]
496
- feature_id = get_id(feature)
497
- feature_name = get_name(feature)
498
- feature_desc = get_description(feature)
499
- ### Order: chromosome number, gene start, gene end, dir of gene, gene description, gene name/locus tag
500
- self.gen_lib_dict[feature_name] = [chrom,int(feature.location.start),int(feature.location.end),get_strand(feature),get_description(feature),get_name(feature)]
501
- except Exception as e:
502
- show_error("Error in build_dict_non_kegg() in generate library.", e)
503
-
504
- # generate function taken from Brian's code
505
- def generate(self,num_targets_per_gene, score_limit, space, output_file, fiveseq):
506
- try:
507
- deletedDict = dict()
508
-
509
- # check and see if we need to search based on target_range
510
- startNum = float(self.start_target_range.text())
511
- endNum = float(self.end_target_range.text())
512
- checkStartandEndBool = False
513
- if startNum != 0.0 or endNum != 100.0:
514
- if startNum >= 0.0 and endNum <= 100.0:
515
- startNum = startNum / 100
516
- endNum = endNum / 100
517
- checkStartandEndBool = True
518
- else:
519
- show_message(
520
- fontSize=12,
521
- icon=QtWidgets.QMessageBox.Icon.Critical,
522
- title="Invalid Targeting Range:",
523
- message="Please select a targeting range between 0 and 100."
524
- )
525
- return -1
526
-
527
- for gene in self.gen_lib_dict:
528
- target_list = self.cspr_data[gene] # Gets the gRNAs for given gene
529
-
530
- #target_list = chrom_list[k:l+1]
531
- # Reverse the target list if the gene is on negative strand:
532
- if self.gen_lib_dict[gene][3] == "-":
533
- target_list.reverse()
534
-
535
- # Filter out the guides with low scores and long strings of T's
536
- # also store the ones deleted if the user selects 'modify search parameters'
537
- if self.modifyParamscheckBox.isChecked():
538
- deletedDict[gene] = list()
539
- for i in range(len(target_list) - 1, -1, -1): ### Start at end and move backwards through list
540
- # check the target_range here
541
- if int(target_list[i][3]) < int(score_limit):
542
- if self.modifyParamscheckBox.isChecked():
543
- deletedDict[gene].append(target_list[i])
544
- target_list.pop(i)
545
- # check for gRNAs with poly T regions here
546
- elif re.search("T{5,10}", target_list[i][1]) is not None:
547
- if self.modifyParamscheckBox.isChecked():
548
- deletedDict[gene].append(target_list[i])
549
- target_list.pop(i)
550
-
551
- # check for the fiveseq
552
- if fiveseq != '':
553
- for i in range(len(target_list) - 1, -1, -1): ### Start at end and move backwards through list
554
- if not target_list[i][1].startswith(fiveseq.upper()):
555
- if self.modifyParamscheckBox.isChecked():
556
- deletedDict[gene].append(target_list[i])
557
- target_list.pop(i)
558
- # check the target range here
559
- if checkStartandEndBool:
560
- for i in range(len(target_list) - 1, -1, -1):
561
- totalDistance = self.gen_lib_dict[gene][2] - self.gen_lib_dict[gene][1]
562
- target_loc = abs(int(target_list[i][0])) - int(self.gen_lib_dict[gene][1])
563
- myRatio = target_loc / totalDistance
564
-
565
- if not (startNum <= myRatio <= endNum):
566
- if self.modifyParamscheckBox.isChecked():
567
- deletedDict[gene].append(target_list[i])
568
- target_list.pop(i)
569
- # if the user selected off-targeting, check to see that the targets do not exceed the selected max score
570
- if self.find_off_Checkbox.isChecked():
571
- maxScore = float(self.maxOFF_comboBox.text())
572
- for i in range(len(target_list) - 1, -1, -1):
573
- if maxScore < float(target_list[i][5]):
574
- if self.modifyParamscheckBox.isChecked():
575
- deletedDict[gene].append(target_list[i])
576
- target_list.pop(i)
577
- # Now generating the targets
578
- self.Output[gene] = list()
579
- i = 0
580
- vec_index = 0
581
- prev_target = (0, "xyz", 'abc', 1, "-")
582
- while i < num_targets_per_gene:
583
- # select the first five targets with the score and space filter that is set in the beginning
584
- if len(target_list) == 0 or vec_index >= len(target_list):
585
- break
586
- while abs(int(target_list[vec_index][0]) - int(prev_target[0])) < int(space):
587
- if target_list[vec_index][3] > prev_target[3] and prev_target != (0,"xyz", "abc", 1, "-"):
588
- self.Output[gene].remove(prev_target)
589
- self.Output[gene].append(target_list[vec_index])
590
- prev_target = target_list[vec_index]
591
- vec_index += 1
592
- # check and see if there will be a indexing error
593
- if vec_index >= len(target_list) - 1:
594
- vec_index = vec_index - 1
595
- break
596
- # Add the new target to the output and add another to i
597
- self.Output[gene].append(target_list[vec_index])
598
- prev_target = target_list[vec_index]
599
- i += 1
600
- vec_index += 1
601
-
602
- # if the user selects modify search parameters, go through and check to see if each one has the number of targets that the user wanted
603
- # if not, append from the deletedDict until they do
604
- if self.modifyParamscheckBox.isChecked():
605
- for gene in self.Output:
606
- if len(self.Output[gene]) < num_targets_per_gene:
607
- for i in range(len(deletedDict[gene])):
608
- if len(self.Output[gene]) == num_targets_per_gene:
609
- break
610
- else:
611
- loc = deletedDict[gene][i][0]
612
- seq = deletedDict[gene][i][1]
613
- pam = deletedDict[gene][i][2]
614
- score = deletedDict[gene][i][3]
615
- strand = deletedDict[gene][i][4] + '*'
616
- endo = deletedDict[gene][i][5]
617
- self.Output[gene].append((loc, seq, pam, score, strand, endo))
618
-
619
- # Now output to the file
620
- try:
621
- f = open(output_file, 'w')
622
- # if OT checked
623
- if self.find_off_Checkbox.isChecked():
624
- f.write('Gene Name,Sequence,On-Target Score,Off-Target Score,Location,PAM,Strand\n')
625
- elif not self.find_off_Checkbox.isChecked():
626
- f.write('Gene Name,Sequence,On-Target Score,Location,PAM,Strand\n')
627
-
628
- for gene in self.Output:
629
- i = 0
630
- gene_name = self.gen_lib_dict[gene][-1]
631
- for target in self.Output[gene]:
632
- # check to see if the target did not match the user's parameters and they selected 'modify'
633
- # if the target has an error, put 2 asterisks in front of the target sequence
634
- if '*' in target[4]:
635
- tag_id = "**" + gene_name + "-" + str(i + 1)
636
- else:
637
- tag_id = gene_name + "-" + str(i + 1)
638
- i += 1
639
-
640
- tag_id = tag_id.replace(',', '')
641
-
642
- # if OT checked
643
- if self.find_off_Checkbox.isChecked():
644
- f.write(tag_id + ',' + target[1] + ',' + str(target[3]) + ',' + str(target[5]) + ',' + str(abs(int(target[0]))) + ',' + target[2] + ',' + target[4][0] + '\n')
645
- # if OT not checked
646
- elif not self.find_off_Checkbox.isChecked():
647
- f.write(tag_id + ',' + target[1] + ',' + str(target[3]) + ',' + str(abs(int(target[0]))) + ',' + target[2] + ',' + target[4][0] + '\n')
648
-
649
- f.close()
650
- except PermissionError:
651
- show_message(
652
- fontSize=12,
653
- icon=QtWidgets.QMessageBox.Icon.Critical,
654
- title="File Cannot Open",
655
- message="This file cannot be opened. Please make sure that the file is not opened elsewhere and try again."
656
- )
657
- return -1
658
- except Exception as e:
659
- print(e)
660
- return
661
- except Exception as e:
662
- show_error("Error in generate() in generate library.", e)